author     Bryan Newbold <bnewbold@archive.org>    2019-12-27 18:04:51 -0800
committer  Bryan Newbold <bnewbold@archive.org>    2020-01-02 18:12:58 -0800
commit     be8c5e0c5d7a97cfd72973ace206aa39a1c58052 (patch)
tree       affeb508c965803db6dcc64226fa76c13380c07b
parent     028a0c27a832833e8833e3b3d0e1d6725a48e953 (diff)
improvements to grobid persist worker
-rw-r--r--  python/sandcrawler/persist.py  29
1 file changed, 16 insertions, 13 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index ea54d6b..345e01a 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -195,7 +195,7 @@ class PersistIngestFileResultWorker(SandcrawlerWorker):
 
         file_meta_batch = [r['file_meta'] for r in batch if r.get('hit') and r.get('file_meta')]
         if file_meta_batch:
-            resp = self.db.insert_file_meta(self.cur, file_meta_batch)
+            resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
             self.counts['insert-file_meta'] += resp[0]
             self.counts['update-file_meta'] += resp[1]
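
This first hunk (and the matching one at the end of the diff) just threads on_conflict="update" through to insert_file_meta(), so re-processed files refresh existing file_meta rows instead of being dropped on conflict. The helper itself is not part of this diff; below is a hedged sketch of what a psycopg2-based upsert returning (inserts, updates) counts might look like. The column names and the xmax counting trick are assumptions for illustration, not the actual sandcrawler schema.

# Hedged sketch only: the real insert_file_meta() lives in sandcrawler's db
# module and may differ in schema and counting details.
from psycopg2.extras import execute_values

def insert_file_meta(cur, batch, on_conflict="nothing"):
    sql = """
        INSERT INTO file_meta (sha1hex, sha256hex, md5hex, size_bytes, mimetype)
        VALUES %s
        ON CONFLICT (sha1hex) DO
    """
    if on_conflict == "update":
        # refresh the existing row instead of silently dropping the new values
        sql += """
            UPDATE SET
                sha256hex=EXCLUDED.sha256hex,
                md5hex=EXCLUDED.md5hex,
                size_bytes=EXCLUDED.size_bytes,
                mimetype=EXCLUDED.mimetype
        """
    else:
        sql += " NOTHING"
    sql += " RETURNING xmax"
    rows = [
        (d['sha1hex'], d.get('sha256hex'), d.get('md5hex'), d.get('size_bytes'), d.get('mimetype'))
        for d in batch
    ]
    resp = execute_values(cur, sql, rows, page_size=250, fetch=True)
    # xmax is 0 for freshly inserted rows, non-zero for rows hit by the UPDATE path
    inserts = len([r for r in resp if str(r[0]) == '0'])
    return (inserts, len(resp) - inserts)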
@@ -227,11 +227,23 @@ class PersistGrobidWorker(SandcrawlerWorker):
     def push_batch(self, batch):
         self.counts['total'] += len(batch)
 
-        # enhance with teixml2json metadata, if available
         for r in batch:
             if r['status_code'] != 200 or not r.get('tei_xml'):
-                self.counts['s3-skip'] += 1
+                self.counts['s3-skip-status'] += 1
+                if r.get('error_msg'):
+                    r['metadata']['error_msg'] = r['error_msg'][:500]
                 continue
+
+            assert len(r['key']) == 40
+            resp = self.s3.put_blob(
+                folder="grobid",
+                blob=r['tei_xml'],
+                sha1hex=r['key'],
+                extension=".tei.xml",
+            )
+            self.counts['s3-put'] += 1
+
+            # enhance with teixml2json metadata, if available
             metadata = self.grobid.metadata(r)
            if not metadata:
                continue
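
The main change: the S3 put of the TEI-XML blob now happens before the teixml2json metadata step, so blobs are persisted even when metadata extraction later fails, and failed GROBID responses get a more specific s3-skip-status counter plus a truncated error_msg. put_blob() is provided elsewhere in sandcrawler; a rough sketch of that kind of helper on top of the minio client follows. The bucket name and sharded path layout are assumptions for illustration, not necessarily what sandcrawler's S3 client does.

# Rough sketch of a put_blob()-style helper; bucket name and path layout are assumed.
import io
from minio import Minio

class BlobStoreSketch:
    def __init__(self, host, access_key, secret_key, bucket="sandcrawler"):
        self.bucket = bucket
        self.mc = Minio(host, access_key=access_key, secret_key=secret_key, secure=False)

    def put_blob(self, folder, blob, sha1hex, extension=""):
        if isinstance(blob, str):
            blob = blob.encode('utf-8')
        # assumed layout: shard object paths by the first two hex-pairs of the SHA-1
        obj_path = "{}/{}/{}/{}{}".format(folder, sha1hex[0:2], sha1hex[2:4], sha1hex, extension)
        self.mc.put_object(self.bucket, obj_path, io.BytesIO(blob), len(blob))
        return (self.bucket, obj_path)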
@@ -243,15 +255,6 @@ class PersistGrobidWorker(SandcrawlerWorker):
             r['updated'] = metadata['grobid_timestamp']
             r['metadata'] = metadata
 
-            assert len(r['key']) == 40
-            resp = self.s3.put_blob(
-                folder="grobid",
-                blob=r['tei_xml'],
-                sha1hex=r['key'],
-                extension=".tei.xml",
-            )
-            self.counts['s3-put'] += 1
-
         if not self.s3_only:
             grobid_batch = [r['grobid'] for r in batch if r.get('grobid')]
             resp = self.db.insert_grobid(self.cur, batch, on_conflict="update")
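
This removed block is the same S3 put relocated earlier in the loop; previously it ran only after metadata extraction succeeded. The assert it carries expects a 40-character key because the record key is presumably the PDF's SHA-1 as a hex string, which is always 40 characters:

# illustration only: a SHA-1 hex digest is always 40 characters
import hashlib

pdf_bytes = b"%PDF-1.4 example bytes"
sha1hex = hashlib.sha1(pdf_bytes).hexdigest()
assert len(sha1hex) == 40
print(sha1hex)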
@@ -259,7 +262,7 @@ class PersistGrobidWorker(SandcrawlerWorker):
             self.counts['update-grobid'] += resp[1]
 
             file_meta_batch = [r['file_meta'] for r in batch if r.get('file_meta')]
-            resp = self.db.insert_file_meta(self.cur, file_meta_batch)
+            resp = self.db.insert_file_meta(self.cur, file_meta_batch, on_conflict="update")
             self.counts['insert-file-meta'] += resp[0]
             self.counts['update-file-meta'] += resp[1]