aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/persist.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-19 11:52:25 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-19 11:52:25 -0800
commit af051a2f401b97919d5e073f0962d4147fbfac8b (patch)
tree 831cf5f8cbc50643d22d4c7faa21f6d4085fdbe5 /python/sandcrawler/persist.py
parent e6f2a585868b0277145659b9d653a0288f76f5b6 (diff)
download sandcrawler-af051a2f401b97919d5e073f0962d4147fbfac8b.tar.gz
sandcrawler-af051a2f401b97919d5e073f0962d4147fbfac8b.zip
pdf_trio persist fixes from prod
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- python/sandcrawler/persist.py 6
1 files changed, 5 insertions, 1 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 3f2762a..88ac6b5 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -326,8 +326,12 @@ class PersistPdfTrioWorker(SandcrawlerWorker):
def push_batch(self, batch):
self.counts['total'] += len(batch)
+ batch = [r for r in batch if 'pdf_trio' in r and r['pdf_trio'].get('status_code')]
+ for r in batch:
+ # copy key (sha1hex) into sub-object
+ r['pdf_trio']['key'] = r['key']
pdftrio_batch = [r['pdf_trio'] for r in batch]
- resp = self.db.insert_pdftrio(self.cur, pdftrio_batch)
+ resp = self.db.insert_pdftrio(self.cur, pdftrio_batch, on_conflict="update")
self.counts['insert-pdftrio'] += resp[0]
self.counts['update-pdftrio'] += resp[1]