about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/persist.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-02-13 15:28:48 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-02-13 15:28:48 -0800
commit: 3370f203c3652ace357eeb69bb8828d830b3596a (patch)
tree: e283ade7600932b84605b84c852da01c2cd2dbdb /python/sandcrawler/persist.py
parent: 4aec6410c2318972240ded2bce5f68706aae18df (diff)
download: sandcrawler-3370f203c3652ace357eeb69bb8828d830b3596a.tar.gz
sandcrawler-3370f203c3652ace357eeb69bb8828d830b3596a.zip
move pdf_trio results back under key in JSON/Kafka
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- python/sandcrawler/persist.py 10
1 files changed, 9 insertions, 1 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 64b2022..bfd8247 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -325,8 +325,16 @@ class PersistPdfTrioWorker(SandcrawlerWorker):
def push_batch(self, batch):
self.counts['total'] += len(batch)
- resp = self.db.insert_pdftrio(self.cur, batch)
+
+ pdftrio_batch = [r['pdf_trio'] for r in batch]
+ resp = self.db.insert_pdftrio(self.cur, pdftrio_batch)
self.counts['insert-pdftrio'] += resp[0]
self.counts['update-pdftrio'] += resp[1]
+
+ file_meta_batch = [r['file_meta'] for r in batch if r['pdf_trio']['status'] == "success" and r.get('file_meta')]
+ resp = self.db.insert_file_meta(self.cur, file_meta_batch)
+ self.counts['insert-file-meta'] += resp[0]
+ self.counts['update-file-meta'] += resp[1]
+
self.db.commit()
return []