aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/persist.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-02-12 19:40:55 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-02-12 19:42:43 -0800
commit: 94912e739c51d2fa4d5f9de878d0b0f0544a4459 (patch)
tree: af7803bee388beba7dd6dce2113e3632284537ac /python/sandcrawler/persist.py
parent: 6b3ce3169847a16fe6c0ab00f3a8af8b8ad099ab (diff)
download: sandcrawler-94912e739c51d2fa4d5f9de878d0b0f0544a4459.tar.gz
sandcrawler-94912e739c51d2fa4d5f9de878d0b0f0544a4459.zip
pdftrio basic python code
This is basically just a copy/paste of GROBID code, only simpler!
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- python/sandcrawler/persist.py 21
1 files changed, 21 insertions, 0 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 6469940..64b2022 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -309,3 +309,24 @@ class PersistGrobidDiskWorker(SandcrawlerWorker):
self.counts['written'] += 1
return record
+
class PersistPdfTrioWorker(SandcrawlerWorker):
    """Batch-only worker that persists pdftrio records to the database.

    Each batch is passed to the database client's ``insert_pdftrio`` and
    committed before ``push_batch`` returns.
    """

    def __init__(self, db_url, **kwargs):
        """Create the database client for ``db_url`` and open a cursor.

        Extra keyword arguments are accepted but not used here.
        """
        super().__init__()
        client = SandcrawlerPostgresClient(db_url)
        self.db = client
        self.cur = client.conn.cursor()

    def process(self, record):
        """
        Not supported: records are only persisted in batches (as
        transactions), so callers must use push_batch() instead.
        """
        raise NotImplementedError

    def push_batch(self, batch):
        """
        Insert ``batch`` through the database client, tally the counters,
        and commit.

        Always returns an empty list.
        """
        self.counts['total'] += len(batch)
        result = self.db.insert_pdftrio(self.cur, batch)
        # Element 0 of the result is tallied as inserts, element 1 as updates.
        for counter_key, position in (('insert-pdftrio', 0), ('update-pdftrio', 1)):
            self.counts[counter_key] += result[position]
        self.db.commit()
        return []