diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-19 11:52:25 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-19 11:52:25 -0800 |
commit | af051a2f401b97919d5e073f0962d4147fbfac8b (patch) | |
tree | 831cf5f8cbc50643d22d4c7faa21f6d4085fdbe5 | |
parent | e6f2a585868b0277145659b9d653a0288f76f5b6 (diff) | |
download | sandcrawler-af051a2f401b97919d5e073f0962d4147fbfac8b.tar.gz sandcrawler-af051a2f401b97919d5e073f0962d4147fbfac8b.zip |
pdf_trio persist fixes from prod
-rw-r--r-- | python/sandcrawler/db.py | 8 | ||||
-rw-r--r-- | python/sandcrawler/persist.py | 6 |
2 files changed, 9 insertions, 5 deletions
```diff
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 673912c..03cc15f 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -219,10 +219,10 @@ class SandcrawlerPostgresClient:
             d['status'],
             d.get('versions', {}).get('pdftrio_version') or None,
             d.get('versions', {}).get('models_date') or None,
-            d.get('ensemble_score') or None,
-            d.get('bert_score') or None,
-            d.get('linear_score') or None,
-            d.get('image_score') or None,
+            d.get('ensemble_score'),
+            d.get('bert_score'),
+            d.get('linear_score'),
+            d.get('image_score'),
          )
          for d in batch]
         # filter out duplicate rows by key (sha1hex)
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 3f2762a..88ac6b5 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -326,8 +326,12 @@ class PersistPdfTrioWorker(SandcrawlerWorker):
     def push_batch(self, batch):
         self.counts['total'] += len(batch)
 
+        batch = [r for r in batch if 'pdf_trio' in r and r['pdf_trio'].get('status_code')]
+        for r in batch:
+            # copy key (sha1hex) into sub-object
+            r['pdf_trio']['key'] = r['key']
         pdftrio_batch = [r['pdf_trio'] for r in batch]
-        resp = self.db.insert_pdftrio(self.cur, pdftrio_batch)
+        resp = self.db.insert_pdftrio(self.cur, pdftrio_batch, on_conflict="update")
         self.counts['insert-pdftrio'] += resp[0]
         self.counts['update-pdftrio'] += resp[1]
 
```