diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-10 18:45:58 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-10 19:13:53 -0700 |
commit | 0a778bd6d46a71b7cbec04eb3a5bdb00d91da0de (patch) | |
tree | 2a2d4386acc27941daab1ad33b48f3226823a22e | |
parent | 41c79a8bd9b01ba52dc19e7e3ba13221bf23c560 (diff) | |
download | sandcrawler-0a778bd6d46a71b7cbec04eb3a5bdb00d91da0de.tar.gz sandcrawler-0a778bd6d46a71b7cbec04eb3a5bdb00d91da0de.zip |
wayback 404 test
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 3 | ||||
-rw-r--r-- | mapreduce/tests/test_extraction_cdx_grobid.py | 51 |
2 files changed, 49 insertions, 5 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py index 9a0d795..708e170 100755 --- a/mapreduce/extraction_cdx_grobid.py +++ b/mapreduce/extraction_cdx_grobid.py @@ -109,7 +109,8 @@ class MRExtractCdxGrobid(MRJob): rstore = ResourceStore(loaderfactory=CDXLoaderFactory()) gwb_record = rstore.load_resource(warc_uri, offset, c_size) except wayback.exception.ResourceUnavailable as err: - return None, dict(status="petabox_error", reason="failed to load file contents") + return None, dict(status="error", + reason="failed to load file contents from wayback/petabox") if gwb_record.get_status()[0] != 200: return None, dict(status="error", diff --git a/mapreduce/tests/test_extraction_cdx_grobid.py b/mapreduce/tests/test_extraction_cdx_grobid.py index 729e68b..8549054 100644 --- a/mapreduce/tests/test_extraction_cdx_grobid.py +++ b/mapreduce/tests/test_extraction_cdx_grobid.py @@ -6,8 +6,9 @@ import pytest import struct import responses import happybase_mock +import wayback.exception from unittest import mock -from extraction_cdx_grobid import MRExtractCdxGrobid +from extraction_cdx_grobid import MRExtractCdxGrobid, Resource FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) @@ -192,7 +193,6 @@ def test_grobid_not_xml(mock_fetch, job): @mock.patch('extraction_cdx_grobid.MRExtractCdxGrobid.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None)) -@responses.activate def test_grobid_invalid_connection(mock_fetch, job): status = b'{"status": "done broke"}' @@ -202,8 +202,51 @@ def test_grobid_invalid_connection(mock_fetch, job): output = io.BytesIO() job.sandbox(stdin=raw, stdout=output) - #with pytest.raises... 
job.run_mapper() + output = output.getvalue().decode('utf-8') + assert 'error' in output + assert 'GROBID' in output + assert job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ') == {} + + +def test_wayback_failure(job): + + job.options.warc_uri_prefix = 'http://host.invalid/' + + raw = io.BytesIO(b"""com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz""") + + output = io.BytesIO() + job.sandbox(stdin=raw, stdout=output) + job.run_mapper() + output = output.getvalue().decode('utf-8') + assert 'error' in output + assert 'wayback' in output + assert job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ') == {} + + +@mock.patch('extraction_cdx_grobid.ResourceStore') +def test_wayback_not_found(mock_rs, job): + + # This is... a little convoluted. Basically creating a 404 situation for + # reading a wayback resource. 
+ mock_resource = mock.MagicMock() + mock_resource.get_status.return_value = (404, "Not Found") + mock_rso = mock.MagicMock() + mock_rso.load_resource.return_value = mock_resource + mock_rs.return_value = mock_rso + print(mock_rs().load_resource().get_status()) + + job.options.warc_uri_prefix = 'http://dummy-archive.org/' + + raw = io.BytesIO(b"""com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz""") + + output = io.BytesIO() + job.sandbox(stdin=raw, stdout=output) + job.run_mapper() + output = output.getvalue().decode('utf-8') + + print(output) + assert 'error' in output + assert 'not 200' in output assert job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ') == {} -# TODO: failure to fetch from wayback |