about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-19 11:52:25 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-19 11:52:25 -0800
commit af051a2f401b97919d5e073f0962d4147fbfac8b (patch)
tree 831cf5f8cbc50643d22d4c7faa21f6d4085fdbe5
parent e6f2a585868b0277145659b9d653a0288f76f5b6 (diff)
download sandcrawler-af051a2f401b97919d5e073f0962d4147fbfac8b.tar.gz
sandcrawler-af051a2f401b97919d5e073f0962d4147fbfac8b.zip
pdf_trio persist fixes from prod
-rw-r--r-- python/sandcrawler/db.py 8
-rw-r--r-- python/sandcrawler/persist.py 6
2 files changed, 9 insertions, 5 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 673912c..03cc15f 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -219,10 +219,10 @@ class SandcrawlerPostgresClient:
d['status'],
d.get('versions', {}).get('pdftrio_version') or None,
d.get('versions', {}).get('models_date') or None,
- d.get('ensemble_score') or None,
- d.get('bert_score') or None,
- d.get('linear_score') or None,
- d.get('image_score') or None,
+ d.get('ensemble_score'),
+ d.get('bert_score'),
+ d.get('linear_score'),
+ d.get('image_score'),
)
for d in batch]
# filter out duplicate rows by key (sha1hex)
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 3f2762a..88ac6b5 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -326,8 +326,12 @@ class PersistPdfTrioWorker(SandcrawlerWorker):
def push_batch(self, batch):
self.counts['total'] += len(batch)
+ batch = [r for r in batch if 'pdf_trio' in r and r['pdf_trio'].get('status_code')]
+ for r in batch:
+ # copy key (sha1hex) into sub-object
+ r['pdf_trio']['key'] = r['key']
pdftrio_batch = [r['pdf_trio'] for r in batch]
- resp = self.db.insert_pdftrio(self.cur, pdftrio_batch)
+ resp = self.db.insert_pdftrio(self.cur, pdftrio_batch, on_conflict="update")
self.counts['insert-pdftrio'] += resp[0]
self.counts['update-pdftrio'] += resp[1]