From ba6f16a02cfde0e4acb499c00b456b42472c0b00 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 14 Jan 2020 16:05:41 -0800
Subject: more ftp status 226 support

---
 python/grobid_tool.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

(limited to 'python/grobid_tool.py')

diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index ad7841d..dc9780d 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -36,13 +36,21 @@ def run_extract_cdx(args):
     if args.jobs > 1:
         worker = GrobidWorker(grobid_client, wayback_client, sink=None)
         multi_worker = MultiprocessWrapper(worker, args.sink)
-        pusher = CdxLinePusher(multi_worker, args.cdx_file,
-            filter_http_statuses=[200], filter_mimetypes=['application/pdf'],
-            batch_size=args.jobs)
+        pusher = CdxLinePusher(
+            multi_worker,
+            args.cdx_file,
+            filter_http_statuses=[200, 226],
+            filter_mimetypes=['application/pdf'],
+            batch_size=args.jobs,
+        )
     else:
         worker = GrobidWorker(grobid_client, wayback_client, sink=args.sink)
-        pusher = CdxLinePusher(worker, args.cdx_file,
-            filter_http_statuses=[200], filter_mimetypes=['application/pdf'])
+        pusher = CdxLinePusher(
+            worker,
+            args.cdx_file,
+            filter_http_statuses=[200, 226],
+            filter_mimetypes=['application/pdf'],
+        )
     pusher.run()
 
 def run_extract_zipfile(args):
-- 
cgit v1.2.3