about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/grobid.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2019-09-26 15:21:00 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2019-09-26 15:21:00 -0700
commit: 71756038bc376568f7bcf124b6f8a23fc9221594 (patch)
tree: 5b3aefb381d5071cf860153e35dd8e15fc6620da /python/sandcrawler/grobid.py
parent: 37bf997dc0220a30605249655056e90f04e33366 (diff)
download: sandcrawler-71756038bc376568f7bcf124b6f8a23fc9221594.tar.gz
download: sandcrawler-71756038bc376568f7bcf124b6f8a23fc9221594.zip
small improvements to GROBID tool
Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r-- python/sandcrawler/grobid.py 4
1 file changed, 4 insertions, 0 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index a610404..32addca 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -57,6 +57,7 @@ class GrobidWorker(SandcrawlerWorker):
self.consolidate_mode = 1
def process(self, record):
+ self.counts['total'] += 1
if record.get('warc_path') and record.get('warc_offset'):
# it's a full CDX dict. fetch using WaybackClient
if not self.wayback_client:
@@ -81,6 +82,7 @@ class GrobidWorker(SandcrawlerWorker):
result['file_meta'] = gen_file_metadata(blob)
result['source'] = record
result['key'] = result['file_meta']['sha1hex']
+ self.counts[result['status']] += 1
return result
class GrobidBlobWorker(SandcrawlerWorker):
@@ -96,9 +98,11 @@ class GrobidBlobWorker(SandcrawlerWorker):
self.consolidate_mode = 1
def process(self, blob):
+ self.counts['total'] += 1
assert blob
result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
result['file_meta'] = gen_file_metadata(blob)
result['key'] = result['file_meta']['sha1hex']
+ self.counts[result['status']] += 1
return result