diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-12 19:40:55 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-12 19:42:43 -0800 |
commit | 94912e739c51d2fa4d5f9de878d0b0f0544a4459 (patch) | |
tree | af7803bee388beba7dd6dce2113e3632284537ac /python/sandcrawler/persist.py | |
parent | 6b3ce3169847a16fe6c0ab00f3a8af8b8ad099ab (diff) | |
download | sandcrawler-94912e739c51d2fa4d5f9de878d0b0f0544a4459.tar.gz sandcrawler-94912e739c51d2fa4d5f9de878d0b0f0544a4459.zip |
pdftrio basic python code
This is basically just a copy/paste of GROBID code, only simpler!
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- | python/sandcrawler/persist.py | 21 |
1 files changed, 21 insertions, 0 deletions
# Added to python/sandcrawler/persist.py, directly after
# PersistGrobidDiskWorker (hunk @@ -309,3 +309,24 @@).
class PersistPdfTrioWorker(SandcrawlerWorker):
    """
    Batch-only worker that persists pdftrio records through
    SandcrawlerPostgresClient.

    Records are handed over with push_batch(); process() is deliberately left
    unimplemented.
    """

    def __init__(self, db_url, **kwargs):
        """
        db_url: handed unchanged to SandcrawlerPostgresClient.

        Any extra keyword arguments are accepted but not used here.
        """
        super().__init__()
        self.db = SandcrawlerPostgresClient(db_url)
        # One cursor is created up front and reused for every batch.
        self.cur = self.db.conn.cursor()

    def process(self, record):
        """
        Single-record handling is not supported: records are only persisted
        in batches (one commit per batch), see push_batch().
        """
        raise NotImplementedError

    def push_batch(self, batch):
        """
        Hand `batch` to insert_pdftrio() and commit.

        Bumps the 'total', 'insert-pdftrio' and 'update-pdftrio' counters.
        Always returns an empty list.
        """
        batch_size = len(batch)
        self.counts['total'] += batch_size
        # NOTE(review): presumably an (inserted, updated) pair of row counts;
        # confirm against SandcrawlerPostgresClient.insert_pdftrio().
        outcome = self.db.insert_pdftrio(self.cur, batch)
        self.counts['insert-pdftrio'] += outcome[0]
        self.counts['update-pdftrio'] += outcome[1]
        self.db.commit()
        return []