# Provenance: commit 94912e739c51d2fa4d5f9de878d0b0f0544a4459 by Bryan Newbold
# (2020-02-12), "pdftrio basic python code", adding this class to
# python/sandcrawler/persist.py. Per the commit message it is basically a
# copy/paste of the GROBID code, only simpler.


class PersistPdfTrioWorker(SandcrawlerWorker):
    """Batch-only worker that writes pdftrio records through the database client.

    Records are handed to ``insert_pdftrio`` on a ``SandcrawlerPostgresClient``
    one batch at a time, and each batch is committed right after the insert.
    """

    def __init__(self, db_url, **kwargs):
        """Open the database client for ``db_url`` and grab a cursor from it.

        Extra keyword arguments are accepted but not used here.
        """
        super().__init__()
        client = SandcrawlerPostgresClient(db_url)
        self.db = client
        self.cur = client.conn.cursor()

    def process(self, record):
        """Unsupported: this worker only handles batches (as transactions).

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError

    def push_batch(self, batch):
        """Insert ``batch`` via ``insert_pdftrio``, update counters, and commit.

        The first two items of the insert response are added to the
        ``'insert-pdftrio'`` and ``'update-pdftrio'`` counters respectively
        (presumably the inserted and updated row counts; confirm against
        ``insert_pdftrio``).

        Returns:
            An empty list, always.
        """
        self.counts['total'] += len(batch)
        result = self.db.insert_pdftrio(self.cur, batch)
        # Response positions map onto counter keys in order: 0 -> insert, 1 -> update.
        for position, counter_key in enumerate(('insert-pdftrio', 'update-pdftrio')):
            self.counts[counter_key] += result[position]
        self.db.commit()
        return []