From 3370f203c3652ace357eeb69bb8828d830b3596a Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 13 Feb 2020 15:28:48 -0800
Subject: move pdf_trio results back under key in JSON/Kafka

---
 python/sandcrawler/persist.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'python/sandcrawler/persist.py')

diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 64b2022..bfd8247 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -325,8 +325,16 @@ class PersistPdfTrioWorker(SandcrawlerWorker):
 
     def push_batch(self, batch):
         self.counts['total'] += len(batch)
-        resp = self.db.insert_pdftrio(self.cur, batch)
+
+        pdftrio_batch = [r['pdf_trio'] for r in batch]
+        resp = self.db.insert_pdftrio(self.cur, pdftrio_batch)
         self.counts['insert-pdftrio'] += resp[0]
         self.counts['update-pdftrio'] += resp[1]
+
+        file_meta_batch = [r['file_meta'] for r in batch if r['pdf_trio']['status'] == "success" and r.get('file_meta')]
+        resp = self.db.insert_file_meta(self.cur, file_meta_batch)
+        self.counts['insert-file-meta'] += resp[0]
+        self.counts['update-file-meta'] += resp[1]
+
         self.db.commit()
         return []
-- 
cgit v1.2.3