author    | Bryan Newbold <bnewbold@archive.org> | 2019-12-27 18:04:51 -0800
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-02 18:12:58 -0800
commit    | be8c5e0c5d7a97cfd72973ace206aa39a1c58052 (patch)
tree      | affeb508c965803db6dcc64226fa76c13380c07b /python
parent    | 028a0c27a832833e8833e3b3d0e1d6725a48e953 (diff)
download  | sandcrawler-be8c5e0c5d7a97cfd72973ace206aa39a1c58052.tar.gz / sandcrawler-be8c5e0c5d7a97cfd72973ace206aa39a1c58052.zip
improvements to grobid persist worker
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/persist.py | 29
1 file changed, 16 insertions, 13 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index ea54d6b..345e01a 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -195,7 +195,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
 
         file_meta_batch = [r['file_meta'] for r in batch if r.get('hit') and r.get('file_meta')]
         if file_meta_batch:
-            resp = self.db.insert_file_meta(self.cur, file_meta_batch)
+            resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
             self.counts['insert-file_meta'] += resp[0]
             self.counts['update-file_meta'] += resp[1]
 
@@ -227,11 +227,23 @@ class PersistGrobidWorker(SandcrawlerWorker):
     def push_batch(self, batch):
         self.counts['total'] += len(batch)
 
-        # enhance with teixml2json metadata, if available
         for r in batch:
             if r['status_code'] != 200 or not r.get('tei_xml'):
-                self.counts['s3-skip'] += 1
+                self.counts['s3-skip-status'] += 1
+                if r.get('error_msg'):
+                    r['metadata']['error_msg'] = r['error_msg'][:500]
                 continue
+
+            assert len(r['key']) == 40
+            resp = self.s3.put_blob(
+                folder="grobid",
+                blob=r['tei_xml'],
+                sha1hex=r['key'],
+                extension=".tei.xml",
+            )
+            self.counts['s3-put'] += 1
+
+            # enhance with teixml2json metadata, if available
             metadata = self.grobid.metadata(r)
             if not metadata:
                 continue
@@ -243,15 +255,6 @@ class PersistGrobidWorker(SandcrawlerWorker):
             r['updated'] = metadata['grobid_timestamp']
             r['metadata'] = metadata
 
-            assert len(r['key']) == 40
-            resp = self.s3.put_blob(
-                folder="grobid",
-                blob=r['tei_xml'],
-                sha1hex=r['key'],
-                extension=".tei.xml",
-            )
-            self.counts['s3-put'] += 1
-
         if not self.s3_only:
             grobid_batch = [r['grobid'] for r in batch if r.get('grobid')]
             resp = self.db.insert_grobid(self.cur, batch, on_conflict="update")
@@ -259,7 +262,7 @@ class PersistGrobidWorker(SandcrawlerWorker):
             self.counts['update-grobid'] += resp[1]
 
             file_meta_batch = [r['file_meta'] for r in batch if r.get('file_meta')]
-            resp = self.db.insert_file_meta(self.cur, file_meta_batch)
+            resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
             self.counts['insert-file-meta'] += resp[0]
             self.counts['update-file-meta'] += resp[1]
 
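For context, both `insert_file_meta()` call sites now pass `on_conflict="update"`, matching the existing `insert_grobid()` call. The `SandcrawlerPostgresClient` implementation is not part of this diff, so the sketch below only illustrates the assumed upsert semantics behind that flag; the table layout, column names, and the `xmax`-based insert/update counting are assumptions for illustration, not code from this commit.

```python
# Rough sketch (assumption) of what insert_file_meta(cur, batch, on_conflict="update")
# is expected to do: batch-upsert rows into a Postgres file_meta table and return
# (inserted, updated) counts. Table and column names here are illustrative only.
import psycopg2.extras

def insert_file_meta(cur, batch, on_conflict="nothing"):
    sql = """
        INSERT INTO file_meta (sha1hex, sha256hex, md5hex, size_bytes, mimetype)
        VALUES %s
        ON CONFLICT (sha1hex) DO
    """
    if on_conflict == "update":
        # overwrite any existing row for the same sha1 instead of silently skipping it
        sql += """
            UPDATE SET
                sha256hex=EXCLUDED.sha256hex,
                md5hex=EXCLUDED.md5hex,
                size_bytes=EXCLUDED.size_bytes,
                mimetype=EXCLUDED.mimetype
        """
    else:
        sql += " NOTHING"
    sql += " RETURNING xmax;"

    rows = [
        (d['sha1hex'], d['sha256hex'], d['md5hex'], d['size_bytes'], d['mimetype'])
        for d in batch
    ]
    resp = psycopg2.extras.execute_values(cur, sql, rows, page_size=250, fetch=True)
    # xmax is 0 for freshly inserted rows and non-zero for rows taken through the
    # ON CONFLICT ... UPDATE path, which yields the (insert, update) split that
    # push_batch() adds into self.counts.
    inserted = len([r for r in resp if int(r[0]) == 0])
    return (inserted, len(resp) - inserted)
```

If that reading is right, the two halves of the change fit together: moving the S3 `put_blob()` ahead of the `grobid.metadata()` call persists the TEI-XML blob even when metadata extraction later fails, and re-running the worker over the same batch then overwrites the existing `file_meta`/`grobid` rows instead of failing on duplicate keys.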