about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-01-14 16:05:41 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-14 16:05:41 -0800
commit: ba6f16a02cfde0e4acb499c00b456b42472c0b00 (patch)
tree: 9b27d9982e61ee97a82181249c0f9723980a4beb
parent: 21599839802b8ef3a84ffe90855f7bceaaa12a0d (diff)
download: sandcrawler-ba6f16a02cfde0e4acb499c00b456b42472c0b00.tar.gz
sandcrawler-ba6f16a02cfde0e4acb499c00b456b42472c0b00.zip
more ftp status 226 support
-rwxr-xr-x  python/grobid_tool.py  18
-rwxr-xr-x  python/persist_tool.py  2
-rw-r--r--  python/tests/test_grobid.py  8
-rw-r--r--  python/tests/test_pushers.py  2
-rw-r--r--  python/tests/test_wayback.py  2
5 files changed, 23 insertions, 9 deletions
diff --git a/python/grobid_tool.py b/python/grobid_tool.py
index ad7841d..dc9780d 100755
--- a/python/grobid_tool.py
+++ b/python/grobid_tool.py
@@ -36,13 +36,21 @@ def run_extract_cdx(args):
if args.jobs > 1:
worker = GrobidWorker(grobid_client, wayback_client, sink=None)
multi_worker = MultiprocessWrapper(worker, args.sink)
- pusher = CdxLinePusher(multi_worker, args.cdx_file,
- filter_http_statuses=[200], filter_mimetypes=['application/pdf'],
- batch_size=args.jobs)
+ pusher = CdxLinePusher(
+ multi_worker,
+ args.cdx_file,
+ filter_http_statuses=[200, 226],
+ filter_mimetypes=['application/pdf'],
+ batch_size=args.jobs,
+ )
else:
worker = GrobidWorker(grobid_client, wayback_client, sink=args.sink)
- pusher = CdxLinePusher(worker, args.cdx_file,
- filter_http_statuses=[200], filter_mimetypes=['application/pdf'])
+ pusher = CdxLinePusher(
+ worker,
+ args.cdx_file,
+ filter_http_statuses=[200, 226],
+ filter_mimetypes=['application/pdf'],
+ )
pusher.run()
def run_extract_zipfile(args):
diff --git a/python/persist_tool.py b/python/persist_tool.py
index 29345e2..a91d4c3 100755
--- a/python/persist_tool.py
+++ b/python/persist_tool.py
@@ -27,7 +27,7 @@ def run_cdx(args):
pusher = CdxLinePusher(
worker,
args.cdx_file,
- filter_http_statuses=[200],
+ filter_http_statuses=[200, 226],
filter_mimetypes=filter_mimetypes,
#allow_octet_stream
batch_size=200,
diff --git a/python/tests/test_grobid.py b/python/tests/test_grobid.py
index 8c5e080..24ea40e 100644
--- a/python/tests/test_grobid.py
+++ b/python/tests/test_grobid.py
@@ -64,8 +64,12 @@ def test_grobid_worker_cdx(grobid_client, wayback_client):
body=REAL_TEI_XML, content_type='text/xml')
with open('tests/files/example.cdx', 'r') as cdx_file:
- pusher = CdxLinePusher(worker, cdx_file,
- filter_http_statuses=[200], filter_mimetypes=['application/pdf'])
+ pusher = CdxLinePusher(
+ worker,
+ cdx_file,
+ filter_http_statuses=[200, 226],
+ filter_mimetypes=['application/pdf'],
+ )
pusher_counts = pusher.run()
assert pusher_counts['total']
assert pusher_counts['pushed'] == 7
diff --git a/python/tests/test_pushers.py b/python/tests/test_pushers.py
index ed9c0bb..52f26c0 100644
--- a/python/tests/test_pushers.py
+++ b/python/tests/test_pushers.py
@@ -19,7 +19,7 @@ def test_cdx_line_pusher():
# HTTP 200 and application/pdf
with open('tests/files/example.cdx', 'r') as cdx_file:
pusher = CdxLinePusher(sink, cdx_file,
- filter_mimetypes=['application/pdf'], filter_http_statuses=[200])
+ filter_mimetypes=['application/pdf'], filter_http_statuses=[200, 226])
counts = pusher.run()
assert counts['total'] == 20
assert counts['skip-parse'] == 1
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index efffbe2..6bc1ca4 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -119,6 +119,7 @@ def wayback_client(cdx_client, mocker):
resource = mocker.Mock()
client.rstore.load_resource = mocker.MagicMock(return_value=resource)
resource.get_status = mocker.MagicMock(return_value=(200, "Ok"))
+ resource.is_revisit = mocker.MagicMock(return_value=False)
resource.get_location = mocker.MagicMock(return_value=WARC_TARGET)
body = mocker.Mock()
resource.open_raw_content = mocker.MagicMock(return_value=body)
@@ -141,6 +142,7 @@ def wayback_client_pdf(cdx_client, mocker):
resource = mocker.Mock()
client.rstore.load_resource = mocker.MagicMock(return_value=resource)
resource.get_status = mocker.MagicMock(return_value=(200, "Ok"))
+ resource.is_revisit = mocker.MagicMock(return_value=False)
resource.get_location = mocker.MagicMock(return_value=WARC_TARGET)
body = mocker.Mock()
resource.open_raw_content = mocker.MagicMock(return_value=body)