diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-08 13:19:26 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-08 13:19:29 -0800 |
commit | 2035b62d6e46c1c57243ee3e68d1067a30791f54 (patch) | |
tree | 1d2632881b0ad4830594490ea8e2943b8e204494 /python/tests | |
parent | 1ca8b792709dde71f350827fdef6e6596dda55a0 (diff) | |
download | sandcrawler-2035b62d6e46c1c57243ee3e68d1067a30791f54.tar.gz sandcrawler-2035b62d6e46c1c57243ee3e68d1067a30791f54.zip |
refactor CdxApiClient, add tests
- always use auth token and get full CDX rows
- simplify to "fetch" (exact url/dt match) and "lookup_best" methods
- all redirect stuff will be moved to a higher level
Diffstat (limited to 'python/tests')
-rw-r--r-- | python/tests/test_wayback.py | 110 |
1 file changed, 110 insertions, 0 deletions
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py new file mode 100644 index 0000000..7e63ec7 --- /dev/null +++ b/python/tests/test_wayback.py @@ -0,0 +1,110 @@ + +import json +import pytest +import responses + +from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError + + +CDX_TARGET = "http://fatcat.wiki/" +CDX_DT = "20180812220054" +# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/ +CDX_SINGLE_HIT = [ + ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], + ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], +] + +CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR" +# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/ +CDX_MULTI_HIT = [ + ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"], + ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], + # sooner, but not right mimetype + ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], + # sooner and mimetype, but wrong status code + ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], + ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", 
"WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], + ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], + # "best" + ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], + # older + ["wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"], +] + +@pytest.fixture +def cdx_client(): + client = CdxApiClient( + host_url="http://dummy-cdx/cdx", + ) + return client + +@pytest.fixture +def wayback_client(cdx_client): + client = WaybackClient( + cdx_client=cdx_client, + petabox_webdata_secret="dummy-petabox-secret", + ) + return client + +@responses.activate +def test_cdx_fetch(cdx_client): + + responses.add(responses.GET, + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_SINGLE_HIT)) + + resp = cdx_client.fetch(CDX_TARGET, CDX_DT) + + assert len(responses.calls) == 1 + + assert resp.datetime == CDX_DT + assert resp.url == CDX_TARGET + assert resp.sha1b32 == "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR" + assert resp.warc_csize == "8445" + assert resp.warc_offset == "108062304" + assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz" + +@responses.activate +def test_cdx_fetch_errors(cdx_client): + + with pytest.raises(ValueError): + resp = cdx_client.fetch(CDX_TARGET, "2019") + + responses.add(responses.GET, + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_SINGLE_HIT)) + + with pytest.raises(KeyError): + resp = cdx_client.fetch(CDX_TARGET, "20180812220055") + + with pytest.raises(KeyError): + resp = cdx_client.fetch("http://some-other.com", CDX_DT) + + resp = 
cdx_client.fetch(CDX_TARGET, CDX_DT) + assert len(responses.calls) == 3 + +@responses.activate +def test_cdx_lookup_best(cdx_client): + + responses.add(responses.GET, + 'http://dummy-cdx/cdx', + status=200, + body=json.dumps(CDX_MULTI_HIT)) + + resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf") + + assert len(responses.calls) == 1 + + assert resp.datetime == CDX_DT + assert resp.url == CDX_TARGET + assert resp.sha1b32 == CDX_BEST_SHA1B32 + assert resp.warc_path == CDX_SINGLE_HIT[1][-1] + +def test_wayback_fetch(wayback_client, mocker): + # mock something + #mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka') + #blah = mocker.Mock() + return + |