about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2019-09-26 23:59:41 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2019-09-26 23:59:41 -0700
commit: 0da57660b034e051a45e84b18bd142f8dd6be927 (patch)
tree: ba31263259d4b85cb727999a61bc43a888825254
parent: 7eb019dcc158029a86c66b6035abb9f0076b9e45 (diff)
download: sandcrawler-0da57660b034e051a45e84b18bd142f8dd6be927.tar.gz
sandcrawler-0da57660b034e051a45e84b18bd142f8dd6be927.zip
more counts and bugfixes in grobid_tool
-rwxr-xr-x python/grobid_tool.py 2
-rw-r--r-- python/sandcrawler/grobid.py 4
-rw-r--r-- python/sandcrawler/workers.py 6
3 files changed, 7 insertions, 5 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index e7a7e5c..352c2fb 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -48,7 +48,7 @@ def main():
default="dev",
help="Kafka topic namespace to use (eg, prod, qa, dev)")
parser.add_argument('-j', '--jobs',
- default=8,
+ default=8, type=int,
help="parallelism for batch CPU jobs")
parser.add_argument('--grobid-host',
default="http://grobid.qa.fatcat.wiki",
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 32addca..a610404 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -57,7 +57,6 @@ class GrobidWorker(SandcrawlerWorker):
self.consolidate_mode = 1
def process(self, record):
- self.counts['total'] += 1
if record.get('warc_path') and record.get('warc_offset'):
# it's a full CDX dict. fetch using WaybackClient
if not self.wayback_client:
@@ -82,7 +81,6 @@ class GrobidWorker(SandcrawlerWorker):
result['file_meta'] = gen_file_metadata(blob)
result['source'] = record
result['key'] = result['file_meta']['sha1hex']
- self.counts[result['status']] += 1
return result
class GrobidBlobWorker(SandcrawlerWorker):
@@ -98,11 +96,9 @@ class GrobidBlobWorker(SandcrawlerWorker):
self.consolidate_mode = 1
def process(self, blob):
- self.counts['total'] += 1
assert blob
result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
result['file_meta'] = gen_file_metadata(blob)
result['key'] = result['file_meta']['sha1hex']
- self.counts[result['status']] += 1
return result
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index e6f5d4b..e86d400 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -28,6 +28,9 @@ class SandcrawlerWorker(object):
if not result:
self.counts['failed'] += 1
return
+ elif type(result) == dict and 'status' in result and len(result['status']) < 32:
+ self.counts[result['status']] += 1
+
if self.sink:
self.sink.push_record(result)
self.counts['pushed'] += 1
@@ -63,6 +66,9 @@ class MultiprocessWrapper(SandcrawlerWorker):
if not result:
self.counts['failed'] += 1
return
+ elif type(result) == dict and 'status' in result and len(result['status']) < 32:
+ self.counts[result['status']] += 1
+
if self.sink:
self.sink.push_record(result)
self.counts['pushed'] += 1