diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-13 15:28:48 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-13 15:28:48 -0800 |
commit | 3370f203c3652ace357eeb69bb8828d830b3596a (patch) | |
tree | e283ade7600932b84605b84c852da01c2cd2dbdb /python/sandcrawler/persist.py | |
parent | 4aec6410c2318972240ded2bce5f68706aae18df (diff) | |
download | sandcrawler-3370f203c3652ace357eeb69bb8828d830b3596a.tar.gz sandcrawler-3370f203c3652ace357eeb69bb8828d830b3596a.zip |
move pdf_trio results back under key in JSON/Kafka
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- | python/sandcrawler/persist.py | 10 |
1 files changed, 9 insertions, 1 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 64b2022..bfd8247 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -325,8 +325,16 @@ class PersistPdfTrioWorker(SandcrawlerWorker):
 
     def push_batch(self, batch):
         self.counts['total'] += len(batch)
-        resp = self.db.insert_pdftrio(self.cur, batch)
+
+        pdftrio_batch = [r['pdf_trio'] for r in batch]
+        resp = self.db.insert_pdftrio(self.cur, pdftrio_batch)
         self.counts['insert-pdftrio'] += resp[0]
         self.counts['update-pdftrio'] += resp[1]
+
+        file_meta_batch = [r['file_meta'] for r in batch if r['pdf_trio']['status'] == "success" and r.get('file_meta')]
+        resp = self.db.insert_file_meta(self.cur, file_meta_batch)
+        self.counts['insert-file-meta'] += resp[0]
+        self.counts['update-file-meta'] += resp[1]
+
         self.db.commit()
         return []