aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-04-16 16:52:59 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-04-16 16:53:01 -0700
commit: 622c5bb1f9b6f4d773a31ead2fd9b14413a6fb00 (patch)
tree: fbfef8a75d410735c1b22d57ac49bbaae4f07ba7
parent: 83ca181637dfc34804649e1d342e3cb3ee59b5df (diff)
download: sandcrawler-622c5bb1f9b6f4d773a31ead2fd9b14413a6fb00.tar.gz
sandcrawler-622c5bb1f9b6f4d773a31ead2fd9b14413a6fb00.zip
persist: only GROBID updates file_meta, not file-result
The hope here is to reduce deadlocks in production (on aitio). As context, we are only doing "updates" until the entire file_meta table is filled in with full metadata anyway; updates are wasteful of resources, and for most inserts we have already seen the file before, so we should be doing "DO NOTHING" if the SHA1 is already in the table.
-rw-r--r-- python/sandcrawler/persist.py 2
1 files changed, 1 insertions, 1 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 379fd8b..f2a4893 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -191,7 +191,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
file_meta_batch = [r['file_meta'] for r in batch if r.get('hit') and r.get('file_meta')]
if file_meta_batch:
- resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
+ resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="nothing")
self.counts['insert-file_meta'] += resp[0]
self.counts['update-file_meta'] += resp[1]