From af051a2f401b97919d5e073f0962d4147fbfac8b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 19 Feb 2020 11:52:25 -0800 Subject: pdf_trio persist fixes from prod --- python/sandcrawler/db.py | 8 ++++---- python/sandcrawler/persist.py | 6 +++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index 673912c..03cc15f 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -219,10 +219,10 @@ class SandcrawlerPostgresClient: d['status'], d.get('versions', {}).get('pdftrio_version') or None, d.get('versions', {}).get('models_date') or None, - d.get('ensemble_score') or None, - d.get('bert_score') or None, - d.get('linear_score') or None, - d.get('image_score') or None, + d.get('ensemble_score'), + d.get('bert_score'), + d.get('linear_score'), + d.get('image_score'), ) for d in batch] # filter out duplicate rows by key (sha1hex) diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 3f2762a..88ac6b5 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -326,8 +326,12 @@ class PersistPdfTrioWorker(SandcrawlerWorker): def push_batch(self, batch): self.counts['total'] += len(batch) + batch = [r for r in batch if 'pdf_trio' in r and r['pdf_trio'].get('status_code')] + for r in batch: + # copy key (sha1hex) into sub-object + r['pdf_trio']['key'] = r['key'] pdftrio_batch = [r['pdf_trio'] for r in batch] - resp = self.db.insert_pdftrio(self.cur, pdftrio_batch) + resp = self.db.insert_pdftrio(self.cur, pdftrio_batch, on_conflict="update") self.counts['insert-pdftrio'] += resp[0] self.counts['update-pdftrio'] += resp[1] -- cgit v1.2.3