about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2019-10-04 14:20:05 -0700
committerBryan Newbold <bnewbold@archive.org>2019-10-04 14:20:05 -0700
commit1a9df830d128cbd0bf80ff585785e226a6cb9019 (patch)
tree6c0b47953d87171688fdeed1b566ff5dc4449209 /python
parentee797ddc0a1377423cfe1939634e6d019eecea9e (diff)
downloadsandcrawler-1a9df830d128cbd0bf80ff585785e226a6cb9019.tar.gz
sandcrawler-1a9df830d128cbd0bf80ff585785e226a6cb9019.zip
grobid_tool: don't always insert multi wrapper
Diffstat (limited to 'python')
-rwxr-xr-xpython/grobid_tool.py19
1 file changed, 13 insertions, 6 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index 3533f43..9af0ab2 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -22,18 +22,25 @@ def run_extract_json(args):
grobid_client = GrobidClient(host_url=args.grobid_host)
wayback_client = WaybackClient()
worker = GrobidWorker(grobid_client, wayback_client, sink=None)
- multi_worker = MultiprocessWrapper(worker, args.sink)
- pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
+ if args.jobs > 1:
+ multi_worker = MultiprocessWrapper(worker, args.sink)
+ pusher = JsonLinePusher(multi_worker, args.json_file, batch_size=args.jobs)
+ else:
+ pusher = JsonLinePusher(worker, args.json_file)
pusher.run()
def run_extract_cdx(args):
grobid_client = GrobidClient(host_url=args.grobid_host)
wayback_client = WaybackClient()
worker = GrobidWorker(grobid_client, wayback_client, sink=None)
- multi_worker = MultiprocessWrapper(worker, args.sink)
- pusher = CdxLinePusher(multi_worker, args.cdx_file,
- filter_http_statuses=[200], filter_mimetypes=['application/pdf'],
- batch_size=args.jobs)
+ if args.jobs > 1:
+ multi_worker = MultiprocessWrapper(worker, args.sink)
+ pusher = CdxLinePusher(multi_worker, args.cdx_file,
+ filter_http_statuses=[200], filter_mimetypes=['application/pdf'],
+ batch_size=args.jobs)
+ else:
+ pusher = CdxLinePusher(worker, args.cdx_file,
+ filter_http_statuses=[200], filter_mimetypes=['application/pdf'])
pusher.run()
def run_extract_zipfile(args):