diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-19 11:52:25 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-19 11:52:25 -0800 |
commit | af051a2f401b97919d5e073f0962d4147fbfac8b (patch) | |
tree | 831cf5f8cbc50643d22d4c7faa21f6d4085fdbe5 /python/sandcrawler/persist.py | |
parent | e6f2a585868b0277145659b9d653a0288f76f5b6 (diff) | |
download | sandcrawler-af051a2f401b97919d5e073f0962d4147fbfac8b.tar.gz sandcrawler-af051a2f401b97919d5e073f0962d4147fbfac8b.zip |
pdf_trio persist fixes from prod
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- | python/sandcrawler/persist.py | 6 |
1 file changed, 5 insertions, 1 deletion
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 3f2762a..88ac6b5 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -326,8 +326,12 @@ class PersistPdfTrioWorker(SandcrawlerWorker):
     def push_batch(self, batch):
         self.counts['total'] += len(batch)
 
+        batch = [r for r in batch if 'pdf_trio' in r and r['pdf_trio'].get('status_code')]
+        for r in batch:
+            # copy key (sha1hex) into sub-object
+            r['pdf_trio']['key'] = r['key']
         pdftrio_batch = [r['pdf_trio'] for r in batch]
-        resp = self.db.insert_pdftrio(self.cur, pdftrio_batch)
+        resp = self.db.insert_pdftrio(self.cur, pdftrio_batch, on_conflict="update")
         self.counts['insert-pdftrio'] += resp[0]
         self.counts['update-pdftrio'] += resp[1]
 