diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-09-26 23:59:41 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-09-26 23:59:41 -0700 |
commit | 0da57660b034e051a45e84b18bd142f8dd6be927 (patch) | |
tree | ba31263259d4b85cb727999a61bc43a888825254 | |
parent | 7eb019dcc158029a86c66b6035abb9f0076b9e45 (diff) | |
download | sandcrawler-0da57660b034e051a45e84b18bd142f8dd6be927.tar.gz sandcrawler-0da57660b034e051a45e84b18bd142f8dd6be927.zip |
more counts and bugfixes in grobid_tool
-rwxr-xr-x | python/grobid_tool.py | 2 | ||||
-rw-r--r-- | python/sandcrawler/grobid.py | 4 | ||||
-rw-r--r-- | python/sandcrawler/workers.py | 6 |
3 files changed, 7 insertions, 5 deletions
```diff
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index e7a7e5c..352c2fb 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -48,7 +48,7 @@ def main():
         default="dev",
         help="Kafka topic namespace to use (eg, prod, qa, dev)")
     parser.add_argument('-j', '--jobs',
-        default=8,
+        default=8, type=int,
         help="parallelism for batch CPU jobs")
     parser.add_argument('--grobid-host',
         default="http://grobid.qa.fatcat.wiki",
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 32addca..a610404 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -57,7 +57,6 @@ class GrobidWorker(SandcrawlerWorker):
         self.consolidate_mode = 1
 
     def process(self, record):
-        self.counts['total'] += 1
         if record.get('warc_path') and record.get('warc_offset'):
             # it's a full CDX dict. fetch using WaybackClient
             if not self.wayback_client:
@@ -82,7 +81,6 @@ class GrobidWorker(SandcrawlerWorker):
         result['file_meta'] = gen_file_metadata(blob)
         result['source'] = record
         result['key'] = result['file_meta']['sha1hex']
-        self.counts[result['status']] += 1
         return result
 
 class GrobidBlobWorker(SandcrawlerWorker):
@@ -98,11 +96,9 @@ class GrobidBlobWorker(SandcrawlerWorker):
         self.consolidate_mode = 1
 
     def process(self, blob):
-        self.counts['total'] += 1
         assert blob
         result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
         result['file_meta'] = gen_file_metadata(blob)
         result['key'] = result['file_meta']['sha1hex']
-        self.counts[result['status']] += 1
         return result
 
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index e6f5d4b..e86d400 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -28,6 +28,9 @@ class SandcrawlerWorker(object):
         if not result:
             self.counts['failed'] += 1
             return
+        elif type(result) == dict and 'status' in result and len(result['status']) < 32:
+            self.counts[result['status']] += 1
+
         if self.sink:
             self.sink.push_record(result)
             self.counts['pushed'] += 1
@@ -63,6 +66,9 @@ class MultiprocessWrapper(SandcrawlerWorker):
         if not result:
             self.counts['failed'] += 1
             return
+        elif type(result) == dict and 'status' in result and len(result['status']) < 32:
+            self.counts[result['status']] += 1
+
         if self.sink:
             self.sink.push_record(result)
             self.counts['pushed'] += 1
```