diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-10-04 15:28:00 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-10-04 15:28:00 -0700 |
commit | fdc013ee8f7ffd04baa2b1c61b5def0deff6ebe0 (patch) | |
tree | d805dc2b3a7dc4c61a9c2e2f88e90ec713231e88 | |
parent | 055c51a6c4f3a920c4a7eecb5093ffa6e4b64f72 (diff) | |
download | sandcrawler-fdc013ee8f7ffd04baa2b1c61b5def0deff6ebe0.tar.gz sandcrawler-fdc013ee8f7ffd04baa2b1c61b5def0deff6ebe0.zip |
grobid_tool: don't wrap multiprocess if we don't need to
-rwxr-xr-x | python/grobid_tool.py | 6 |
1 files changed, 4 insertions, 2 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py index e787cdf..f21d088 100755 --- a/python/grobid_tool.py +++ b/python/grobid_tool.py @@ -19,24 +19,26 @@ from sandcrawler import * def run_extract_json(args): grobid_client = GrobidClient(host_url=args.grobid_host) wayback_client = WaybackClient() - worker = GrobidWorker(grobid_client, wayback_client, sink=None) if args.jobs > 1: + worker = GrobidWorker(grobid_client, wayback_client, sink=None) multi_worker = MultiprocessWrapper(worker, args.sink) pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs) else: + worker = GrobidWorker(grobid_client, wayback_client, sink=args.sink) pusher = JsonLinePusher(worker, args.json_file) pusher.run() def run_extract_cdx(args): grobid_client = GrobidClient(host_url=args.grobid_host) wayback_client = WaybackClient() - worker = GrobidWorker(grobid_client, wayback_client, sink=None) if args.jobs > 1: + worker = GrobidWorker(grobid_client, wayback_client, sink=None) multi_worker = MultiprocessWrapper(worker, args.sink) pusher = CdxLinePusher(multi_worker, args.cdx_file, filter_http_statuses=[200], filter_mimetypes=['application/pdf'], batch_size=args.jobs) else: + worker = GrobidWorker(grobid_client, wayback_client, sink=args.sink) pusher = CdxLinePusher(worker, args.cdx_file, filter_http_statuses=[200], filter_mimetypes=['application/pdf']) pusher.run() |