diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-16 13:46:30 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-16 13:46:30 -0700 |
commit | 83ca181637dfc34804649e1d342e3cb3ee59b5df (patch) | |
tree | a9a46b42a9ee0d2917f95159c1b9d12392f9e5cf /python/grobid_tool.py | |
parent | 7243649b0171c0c02bda41ea57626ed4c0f59db0 (diff) | |
download | sandcrawler-83ca181637dfc34804649e1d342e3cb3ee59b5df.tar.gz sandcrawler-83ca181637dfc34804649e1d342e3cb3ee59b5df.zip |
batch/multiprocess for ZipfilePusher
Diffstat (limited to 'python/grobid_tool.py')
-rwxr-xr-x | python/grobid_tool.py | 10 |
1 files changed, 8 insertions, 2 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index dc9780d..a2d74a1 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -55,8 +55,14 @@ def run_extract_cdx(args):
 
 def run_extract_zipfile(args):
     grobid_client = GrobidClient(host_url=args.grobid_host)
-    worker = GrobidBlobWorker(grobid_client, sink=args.sink)
-    pusher = ZipfilePusher(worker, args.zip_file)
+    if args.jobs > 1:
+        print("multi-processing: {}".format(args.jobs), file=sys.stderr)
+        worker = GrobidBlobWorker(grobid_client, sink=None)
+        multi_worker = MultiprocessWrapper(worker, args.sink, jobs=args.jobs)
+        pusher = ZipfilePusher(multi_worker, args.zip_file, batch_size=args.jobs)
+    else:
+        worker = GrobidBlobWorker(grobid_client, sink=args.sink)
+        pusher = ZipfilePusher(worker, args.zip_file)
     pusher.run()
 
 def run_transform(args):