diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-12-26 21:16:59 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-02 18:12:58 -0800 |
commit | 293d4b176855d400324559c814abd2e404cdf31e (patch) | |
tree | e88b72109cc209fe422264deabdd1e7ca937add7 /python/sandcrawler/persist.py | |
parent | 9fda5323046cb3f87f0c7c7e07eca283ca52ce99 (diff) | |
download | sandcrawler-293d4b176855d400324559c814abd2e404cdf31e.tar.gz sandcrawler-293d4b176855d400324559c814abd2e404cdf31e.zip |
flush out minio helper, add to grobid persist
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- | python/sandcrawler/persist.py | 38 |
1 file changed, 29 insertions(+), 9 deletions(-)
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 86a1c22..b017a82 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -19,6 +19,8 @@ grobid
 - file_meta SQL push batch (on conflict update)
 """

+import os
+
 from sandcrawler.workers import SandcrawlerWorker
 from sandcrawler.db import SandcrawlerPostgresClient
 from sandcrawler.minio import SandcrawlerMinioClient
@@ -187,6 +189,13 @@ class PersistGrobidWorker(SandcrawlerWorker):
         self.db = SandcrawlerPostgresClient(db_url)
         self.cur = self.db.conn.cursor()
         self.grobid = GrobidClient()
+        self.s3 = SandcrawlerMinioClient(
+            host_url=kwargs.get('s3_url', 'localhost:9000'),
+            access_key=kwargs['s3_access_key'],
+            secret_key=kwargs['s3_secret_key'],
+            default_bucket=kwargs['s3_bucket'],
+        )
+        self.s3_only = kwargs.get('s3_only', False)

     def process(self, record):
         """
@@ -200,6 +209,7 @@ class PersistGrobidWorker(SandcrawlerWorker):
         # enhance with teixml2json metadata, if available
         for r in batch:
             if r['status_code'] != 200 or not r.get('tei_xml'):
+                self.counts['s3-skip'] += 1
                 continue
             metadata = self.grobid.metadata(r)
             if not metadata:
@@ -212,15 +222,25 @@ class PersistGrobidWorker(SandcrawlerWorker):
             r['updated'] = metadata['grobid_timestamp']
             r['metadata'] = metadata

-        grobid_batch = [r['grobid'] for r in batch if r.get('grobid')]
-        resp = self.db.insert_grobid(self.cur, batch, on_conflict="update")
-        self.counts['insert-grobid'] += resp
-
-        file_meta_batch = [r['file_meta'] for r in batch if r.get('file_meta')]
-        resp = self.db.insert_file_meta(self.cur, file_meta_batch)
-        self.counts['insert-file-meta'] += resp
-
-        # TODO: minio
+            assert len(r['key']) == 40
+            resp = self.s3.put_blob(
+                folder="grobid",
+                blob=r['tei_xml'],
+                sha1hex=r['key'],
+                extension=".tei.xml",
+            )
+            self.counts['s3-put'] += 1
+
+        if not self.s3_only:
+            grobid_batch = [r['grobid'] for r in batch if r.get('grobid')]
+            resp = self.db.insert_grobid(self.cur, batch, on_conflict="update")
+            self.counts['insert-grobid'] += resp[0]
+            self.counts['update-grobid'] += resp[1]
+
+            file_meta_batch = [r['file_meta'] for r in batch if r.get('file_meta')]
+            resp = self.db.insert_file_meta(self.cur, file_meta_batch)
+            self.counts['insert-file-meta'] += resp[0]
+            self.counts['update-file-meta'] += resp[1]

         self.db.commit()
         return []