aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2019-10-04 15:28:00 -0700
committerBryan Newbold <bnewbold@archive.org>2019-10-04 15:28:00 -0700
commitfdc013ee8f7ffd04baa2b1c61b5def0deff6ebe0 (patch)
treed805dc2b3a7dc4c61a9c2e2f88e90ec713231e88
parent055c51a6c4f3a920c4a7eecb5093ffa6e4b64f72 (diff)
downloadsandcrawler-fdc013ee8f7ffd04baa2b1c61b5def0deff6ebe0.tar.gz
sandcrawler-fdc013ee8f7ffd04baa2b1c61b5def0deff6ebe0.zip
grobid_tool: don't wrap multiprocess if we don't need to
-rwxr-xr-xpython/grobid_tool.py6
1 file changed, 4 insertions, 2 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index e787cdf..f21d088 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -19,24 +19,26 @@ from sandcrawler import *
def run_extract_json(args):
grobid_client = GrobidClient(host_url=args.grobid_host)
wayback_client = WaybackClient()
- worker = GrobidWorker(grobid_client, wayback_client, sink=None)
if args.jobs > 1:
+ worker = GrobidWorker(grobid_client, wayback_client, sink=None)
multi_worker = MultiprocessWrapper(worker, args.sink)
pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
else:
+ worker = GrobidWorker(grobid_client, wayback_client, sink=args.sink)
pusher = JsonLinePusher(worker, args.json_file)
pusher.run()
def run_extract_cdx(args):
grobid_client = GrobidClient(host_url=args.grobid_host)
wayback_client = WaybackClient()
- worker = GrobidWorker(grobid_client, wayback_client, sink=None)
if args.jobs > 1:
+ worker = GrobidWorker(grobid_client, wayback_client, sink=None)
multi_worker = MultiprocessWrapper(worker, args.sink)
pusher = CdxLinePusher(multi_worker, args.cdx_file,
filter_http_statuses=[200], filter_mimetypes=['application/pdf'],
batch_size=args.jobs)
else:
+ worker = GrobidWorker(grobid_client, wayback_client, sink=args.sink)
pusher = CdxLinePusher(worker, args.cdx_file,
filter_http_statuses=[200], filter_mimetypes=['application/pdf'])
pusher.run()