diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2018-04-10 18:45:58 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-10 19:13:53 -0700 | 
| commit | 0a778bd6d46a71b7cbec04eb3a5bdb00d91da0de (patch) | |
| tree | 2a2d4386acc27941daab1ad33b48f3226823a22e /mapreduce | |
| parent | 41c79a8bd9b01ba52dc19e7e3ba13221bf23c560 (diff) | |
| download | sandcrawler-0a778bd6d46a71b7cbec04eb3a5bdb00d91da0de.tar.gz sandcrawler-0a778bd6d46a71b7cbec04eb3a5bdb00d91da0de.zip | |
wayback 404 test
Diffstat (limited to 'mapreduce')
| -rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 3 | ||||
| -rw-r--r-- | mapreduce/tests/test_extraction_cdx_grobid.py | 51 | 
2 files changed, 49 insertions, 5 deletions
| diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py index 9a0d795..708e170 100755 --- a/mapreduce/extraction_cdx_grobid.py +++ b/mapreduce/extraction_cdx_grobid.py @@ -109,7 +109,8 @@ class MRExtractCdxGrobid(MRJob):              rstore = ResourceStore(loaderfactory=CDXLoaderFactory())              gwb_record = rstore.load_resource(warc_uri, offset, c_size)          except wayback.exception.ResourceUnavailable as err: -            return None, dict(status="petabox_error", reason="failed to load file contents") +            return None, dict(status="error", +                reason="failed to load file contents from wayback/petabox")          if gwb_record.get_status()[0] != 200:              return None, dict(status="error", diff --git a/mapreduce/tests/test_extraction_cdx_grobid.py b/mapreduce/tests/test_extraction_cdx_grobid.py index 729e68b..8549054 100644 --- a/mapreduce/tests/test_extraction_cdx_grobid.py +++ b/mapreduce/tests/test_extraction_cdx_grobid.py @@ -6,8 +6,9 @@ import pytest  import struct  import responses  import happybase_mock +import wayback.exception  from unittest import mock -from extraction_cdx_grobid import MRExtractCdxGrobid +from extraction_cdx_grobid import MRExtractCdxGrobid, Resource  FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843) @@ -192,7 +193,6 @@ def test_grobid_not_xml(mock_fetch, job):  @mock.patch('extraction_cdx_grobid.MRExtractCdxGrobid.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None)) -@responses.activate  def test_grobid_invalid_connection(mock_fetch, job):      status = b'{"status": "done broke"}' @@ -202,8 +202,51 @@ def test_grobid_invalid_connection(mock_fetch, job):      output = io.BytesIO()      job.sandbox(stdin=raw, stdout=output) -    #with pytest.raises...      job.run_mapper() +    output = output.getvalue().decode('utf-8') +    assert 'error' in output +    assert 'GROBID' in output +    assert job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ') == {} + + +def test_wayback_failure(job): + +    job.options.warc_uri_prefix = 'http://host.invalid/' + +    raw = io.BytesIO(b"""com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz""") + +    output = io.BytesIO() +    job.sandbox(stdin=raw, stdout=output) +    job.run_mapper() +    output = output.getvalue().decode('utf-8') +    assert 'error' in output +    assert 'wayback' in output +    assert job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ') == {} + + +@mock.patch('extraction_cdx_grobid.ResourceStore') +def test_wayback_not_found(mock_rs, job): + +    # This is... a little convoluded. Basically creating a 404 situation for +    # reading a wayback resource. +    mock_resource = mock.MagicMock() +    mock_resource.get_status.return_value = (404, "Not Found") +    mock_rso = mock.MagicMock() +    mock_rso.load_resource.return_value = mock_resource +    mock_rs.return_value = mock_rso +    print(mock_rs().load_resource().get_status()) + +    job.options.warc_uri_prefix = 'http://dummy-archive.org/' + +    raw = io.BytesIO(b"""com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz""") + +    output = io.BytesIO() +    job.sandbox(stdin=raw, stdout=output) +    job.run_mapper() +    output = output.getvalue().decode('utf-8') + +    print(output) +    assert 'error' in output +    assert 'not 200' in output      assert job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ') == {} -# TODO: failure to fetch from wayback | 
