diff options
Diffstat (limited to 'python/tests/test_wayback.py')
-rw-r--r-- | python/tests/test_wayback.py | 297 |
1 files changed, 297 insertions, 0 deletions
"""Tests for sandcrawler's ``CdxApiClient`` and ``WaybackClient``.

HTTP requests to the (dummy) CDX endpoint are stubbed with ``responses``, and
the wayback resource store is replaced with mocks, so nothing here touches the
network.
"""

import json

import pytest
import responses

from sandcrawler import CdxApiClient, WaybackClient

CDX_TARGET = "http://fatcat.wiki/"
CDX_DT = "20180812220054"
# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_SINGLE_HIT = [
    [
        "urlkey",
        "timestamp",
        "original",
        "mimetype",
        "statuscode",
        "digest",
        "redirect",
        "robotflags",
        "length",
        "offset",
        "filename",
    ],
    [
        "wiki,fatcat)/",
        CDX_DT,
        CDX_TARGET,
        "text/html",
        "200",
        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
        "-",
        "-",
        "8445",
        "108062304",
        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
    ],
]

CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR"
# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_MULTI_HIT = [
    [
        "urlkey",
        "timestamp",
        "original",
        "mimetype",
        "statuscode",
        "digest",
        "redirect",
        "robotflags",
        "length",
        "offset",
        "filename",
    ],
    [
        "wiki,fatcat)/",
        CDX_DT,
        CDX_TARGET,
        "text/html",
        "200",
        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
        "-",
        "-",
        "8445",
        "108062304",
        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
    ],
    # sooner (later timestamp than CDX_DT), but not right mimetype
    [
        "wiki,fatcat)/",
        "20180912220054",
        CDX_TARGET,
        "text/html",
        "200",
        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
        "-",
        "-",
        "8445",
        "108062304",
        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
    ],
    # sooner (later timestamp than CDX_DT) and right mimetype, but wrong status code
    [
        "wiki,fatcat)/",
        "20180912220054",
        CDX_TARGET,
        "application/pdf",
        "400",
        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
        "-",
        "-",
        "8445",
        "108062304",
        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
    ],
    [
        "wiki,fatcat)/",
        "20180912220054",
        CDX_TARGET,
        "application/pdf",
        "500",
        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
        "-",
        "-",
        "8445",
        "108062304",
        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
    ],
    [
        "wiki,fatcat)/",
        "20180912220054",
        CDX_TARGET,
        "application/pdf",
        "150",
        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
        "-",
        "-",
        "8445",
        "108062304",
        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
    ],
    # "best": the row test_cdx_lookup_best expects to be selected
    [
        "wiki,fatcat)/",
        CDX_DT,
        CDX_TARGET,
        "application/pdf",
        "200",
        CDX_BEST_SHA1B32,
        "-",
        "-",
        "8445",
        "108062304",
        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
    ],
    # older
    [
        "wiki,fatcat)/",
        "20180712220054",
        CDX_TARGET,
        "application/pdf",
        "200",
        "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR",
        "-",
        "-",
        "8445",
        "108062304",
        "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz",
    ],
]


@pytest.fixture
def cdx_client():
    """Return a ``CdxApiClient`` pointed at the dummy CDX URL with a dummy token."""
    client = CdxApiClient(
        host_url="http://dummy-cdx/cdx",
        cdx_auth_token="dummy-token",
    )
    return client


@responses.activate
def test_cdx_fetch(cdx_client):
    """An exact URL + datetime fetch returns the fields of the single CDX row."""

    responses.add(
        responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SINGLE_HIT)
    )

    resp = cdx_client.fetch(CDX_TARGET, CDX_DT)

    assert len(responses.calls) == 1

    assert resp.datetime == CDX_DT
    assert resp.url == CDX_TARGET
    assert resp.sha1b32 == "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR"
    assert resp.warc_csize == 8445
    assert resp.warc_offset == 108062304
    assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"


@responses.activate
def test_cdx_fetch_errors(cdx_client):
    """``fetch`` raises on a short datetime and on rows that do not match the request."""

    # A 4-character datetime is expected to be rejected without an HTTP request;
    # the call count asserted below only adds up if this call is not recorded.
    with pytest.raises(ValueError):
        cdx_client.fetch(CDX_TARGET, "2019")

    responses.add(
        responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_SINGLE_HIT)
    )

    # The mocked row has timestamp CDX_DT, not the requested one.
    with pytest.raises(KeyError):
        cdx_client.fetch(CDX_TARGET, "20180812220055")

    # The mocked row is for CDX_TARGET, not the requested URL.
    with pytest.raises(KeyError):
        cdx_client.fetch("http://some-other.com", CDX_DT)

    resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
    assert len(responses.calls) == 3
    assert resp


@responses.activate
def test_cdx_lookup_best(cdx_client):
    """``lookup_best`` picks the row marked "best" in ``CDX_MULTI_HIT``."""

    responses.add(
        responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_MULTI_HIT)
    )

    resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")

    assert len(responses.calls) == 1

    assert resp.datetime == CDX_DT
    assert resp.url == CDX_TARGET
    assert resp.sha1b32 == CDX_BEST_SHA1B32
    assert resp.warc_path == CDX_SINGLE_HIT[1][-1]


WARC_TARGET = "http://fatcat.wiki/"
# NOTE: the tests in this module only compare this payload with itself, so the
# indentation inside the literal is not significant here.
WARC_BODY = b"""
<html>
  <head>
    <meta name="citation_pdf_url" content="http://www.example.com/content/271/20/11761.full.pdf">
  </head>
  <body>
    <h1>my big article here</h1>
    blah
  </body>
</html>
"""


def _mocked_wayback_client(cdx_client, mocker, body_bytes):
    """Build a ``WaybackClient`` whose resource store is fully mocked.

    Args:
        cdx_client: CDX client handed to the ``WaybackClient`` constructor.
        mocker: mock factory fixture providing ``Mock`` / ``MagicMock``
            (presumably pytest-mock's ``mocker``).
        body_bytes: bytes returned by the mocked record body's ``read()``.

    Returns:
        The client, with ``rstore.load_resource`` returning a mock resource
        that reports status ``(200, "Ok")``, is not a revisit, and has
        location ``WARC_TARGET``.
    """
    client = WaybackClient(
        cdx_client=cdx_client,
        petabox_webdata_secret="dummy-petabox-secret",
    )
    # mock out the wayback store with mock stuff
    client.rstore = mocker.Mock()
    resource = mocker.Mock()
    client.rstore.load_resource = mocker.MagicMock(return_value=resource)
    resource.get_status = mocker.MagicMock(return_value=(200, "Ok"))
    resource.is_revisit = mocker.MagicMock(return_value=False)
    resource.get_location = mocker.MagicMock(return_value=WARC_TARGET)
    body = mocker.Mock()
    resource.open_raw_content = mocker.MagicMock(return_value=body)
    body.read = mocker.MagicMock(return_value=body_bytes)

    return client


@pytest.fixture
def wayback_client(cdx_client, mocker):
    """Return a mocked ``WaybackClient`` whose record body is ``WARC_BODY``."""
    return _mocked_wayback_client(cdx_client, mocker, WARC_BODY)


@pytest.fixture
def wayback_client_pdf(cdx_client, mocker):
    """Return a mocked ``WaybackClient`` whose record body is the dummy PDF file."""

    # Path is relative to the working directory the tests are run from.
    with open("tests/files/dummy.pdf", "rb") as f:
        pdf_bytes = f.read()

    return _mocked_wayback_client(cdx_client, mocker, pdf_bytes)


@responses.activate
def test_wayback_fetch(wayback_client):
    """Petabox fetches return the mocked body and location."""
    resp = wayback_client.fetch_petabox(123, 456789, "here/there.warc.gz")
    assert resp.body == WARC_BODY
    assert resp.location == WARC_TARGET

    resp = wayback_client.fetch_petabox_body(123, 456789, "here/there.warc.gz")
    assert resp == WARC_BODY


@responses.activate
def test_lookup_resource_success(wayback_client):
    """``lookup_resource`` reports a hit when the CDX lookup has usable rows."""

    responses.add(
        responses.GET, "http://dummy-cdx/cdx", status=200, body=json.dumps(CDX_MULTI_HIT)
    )

    resp = wayback_client.lookup_resource(CDX_TARGET)

    assert resp.hit is True