diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-10-04 14:20:05 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-10-04 14:20:05 -0700 |
commit | 1a9df830d128cbd0bf80ff585785e226a6cb9019 (patch) | |
tree | 6c0b47953d87171688fdeed1b566ff5dc4449209 /python | |
parent | ee797ddc0a1377423cfe1939634e6d019eecea9e (diff) | |
download | sandcrawler-1a9df830d128cbd0bf80ff585785e226a6cb9019.tar.gz sandcrawler-1a9df830d128cbd0bf80ff585785e226a6cb9019.zip |
grobid_tool: don't always insert multi wrapper
Diffstat (limited to 'python')
-rwxr-xr-x | python/grobid_tool.py | 19 |
1 files changed, 13 insertions, 6 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 3533f43..9af0ab2 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -22,18 +22,25 @@ def run_extract_json(args):
     grobid_client = GrobidClient(host_url=args.grobid_host)
     wayback_client = WaybackClient()
     worker = GrobidWorker(grobid_client, wayback_client, sink=None)
-    multi_worker = MultiprocessWrapper(worker, args.sink)
-    pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
+    if args.jobs > 1:
+        multi_worker = MultiprocessWrapper(worker, args.sink)
+        pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
+    else:
+        pusher = JsonLinePusher(worker, args.json_file)
     pusher.run()
 
 def run_extract_cdx(args):
     grobid_client = GrobidClient(host_url=args.grobid_host)
     wayback_client = WaybackClient()
     worker = GrobidWorker(grobid_client, wayback_client, sink=None)
-    multi_worker = MultiprocessWrapper(worker, args.sink)
-    pusher = CdxLinePusher(multi_worker, args.cdx_file,
-        filter_http_statuses=[200], filter_mimetypes=['application/pdf'],
-        batch_size=args.jobs)
+    if args.jobs > 1:
+        multi_worker = MultiprocessWrapper(worker, args.sink)
+        pusher = CdxLinePusher(multi_worker, args.cdx_file,
+            filter_http_statuses=[200], filter_mimetypes=['application/pdf'],
+            batch_size=args.jobs)
+    else:
+        pusher = CdxLinePusher(worker, args.cdx_file,
+            filter_http_statuses=[200], filter_mimetypes=['application/pdf'])
     pusher.run()
 
 def run_extract_zipfile(args):