From ba6f16a02cfde0e4acb499c00b456b42472c0b00 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 14 Jan 2020 16:05:41 -0800 Subject: more ftp status 226 support --- python/grobid_tool.py | 18 +++++++++++++----- python/persist_tool.py | 2 +- python/tests/test_grobid.py | 8 ++++++-- python/tests/test_pushers.py | 2 +- python/tests/test_wayback.py | 2 ++ 5 files changed, 23 insertions(+), 9 deletions(-) diff --git a/python/grobid_tool.py b/python/grobid_tool.py index ad7841d..dc9780d 100755 --- a/python/grobid_tool.py +++ b/python/grobid_tool.py @@ -36,13 +36,21 @@ def run_extract_cdx(args): if args.jobs > 1: worker = GrobidWorker(grobid_client, wayback_client, sink=None) multi_worker = MultiprocessWrapper(worker, args.sink) - pusher = CdxLinePusher(multi_worker, args.cdx_file, - filter_http_statuses=[200], filter_mimetypes=['application/pdf'], - batch_size=args.jobs) + pusher = CdxLinePusher( + multi_worker, + args.cdx_file, + filter_http_statuses=[200, 226], + filter_mimetypes=['application/pdf'], + batch_size=args.jobs, + ) else: worker = GrobidWorker(grobid_client, wayback_client, sink=args.sink) - pusher = CdxLinePusher(worker, args.cdx_file, - filter_http_statuses=[200], filter_mimetypes=['application/pdf']) + pusher = CdxLinePusher( + worker, + args.cdx_file, + filter_http_statuses=[200, 226], + filter_mimetypes=['application/pdf'], + ) pusher.run() def run_extract_zipfile(args): diff --git a/python/persist_tool.py b/python/persist_tool.py index 29345e2..a91d4c3 100755 --- a/python/persist_tool.py +++ b/python/persist_tool.py @@ -27,7 +27,7 @@ def run_cdx(args): pusher = CdxLinePusher( worker, args.cdx_file, - filter_http_statuses=[200], + filter_http_statuses=[200, 226], filter_mimetypes=filter_mimetypes, #allow_octet_stream batch_size=200, diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py index 8c5e080..24ea40e 100644 --- a/python/tests/test_grobid.py +++ b/python/tests/test_grobid.py @@ -64,8 +64,12 @@ def test_grobid_worker_cdx(grobid_client, wayback_client): body=REAL_TEI_XML, content_type='text/xml') with open('tests/files/example.cdx', 'r') as cdx_file: - pusher = CdxLinePusher(worker, cdx_file, - filter_http_statuses=[200], filter_mimetypes=['application/pdf']) + pusher = CdxLinePusher( + worker, + cdx_file, + filter_http_statuses=[200, 226], + filter_mimetypes=['application/pdf'], + ) pusher_counts = pusher.run() assert pusher_counts['total'] assert pusher_counts['pushed'] == 7 diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py index ed9c0bb..52f26c0 100644 --- a/python/tests/test_pushers.py +++ b/python/tests/test_pushers.py @@ -19,7 +19,7 @@ def test_cdx_line_pusher(): # HTTP 200 and application/pdf with open('tests/files/example.cdx', 'r') as cdx_file: pusher = CdxLinePusher(sink, cdx_file, - filter_mimetypes=['application/pdf'], filter_http_statuses=[200]) + filter_mimetypes=['application/pdf'], filter_http_statuses=[200, 226]) counts = pusher.run() assert counts['total'] == 20 assert counts['skip-parse'] == 1 diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py index efffbe2..6bc1ca4 100644 --- a/python/tests/test_wayback.py +++ b/python/tests/test_wayback.py @@ -119,6 +119,7 @@ def wayback_client(cdx_client, mocker): resource = mocker.Mock() client.rstore.load_resource = mocker.MagicMock(return_value=resource) resource.get_status = mocker.MagicMock(return_value=(200, "Ok")) + resource.is_revisit = mocker.MagicMock(return_value=False) resource.get_location = mocker.MagicMock(return_value=WARC_TARGET) body = mocker.Mock() resource.open_raw_content = mocker.MagicMock(return_value=body) @@ -141,6 +142,7 @@ def wayback_client_pdf(cdx_client, mocker): resource = mocker.Mock() client.rstore.load_resource = mocker.MagicMock(return_value=resource) resource.get_status = mocker.MagicMock(return_value=(200, "Ok")) + resource.is_revisit = mocker.MagicMock(return_value=False) resource.get_location = mocker.MagicMock(return_value=WARC_TARGET) body = mocker.Mock() resource.open_raw_content = mocker.MagicMock(return_value=body) -- cgit v1.2.3