From 622c5bb1f9b6f4d773a31ead2fd9b14413a6fb00 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 16 Apr 2020 16:52:59 -0700 Subject: persist: only GROBID updates file_meta, not file-result The hope here is to reduce deadlocks in production (on aitio). As context, we are only doing "updates" until the entire file_meta table is filled in with full metadata anyway; updates are wasteful of resources, and for most inserts we have already seen the file before, so we should be doing "DO NOTHING" if the SHA1 is already in the table. --- python/sandcrawler/persist.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 379fd8b..f2a4893 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -191,7 +191,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): file_meta_batch = [r['file_meta'] for r in batch if r.get('hit') and r.get('file_meta')] if file_meta_batch: - resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update") + resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="nothing") self.counts['insert-file_meta'] += resp[0] self.counts['update-file_meta'] += resp[1] -- cgit v1.2.3