aboutsummaryrefslogtreecommitdiffstats
path: root/python/tests
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-01-08 13:19:26 -0800
committerBryan Newbold <bnewbold@archive.org>2020-01-08 13:19:29 -0800
commit2035b62d6e46c1c57243ee3e68d1067a30791f54 (patch)
tree1d2632881b0ad4830594490ea8e2943b8e204494 /python/tests
parent1ca8b792709dde71f350827fdef6e6596dda55a0 (diff)
downloadsandcrawler-2035b62d6e46c1c57243ee3e68d1067a30791f54.tar.gz
sandcrawler-2035b62d6e46c1c57243ee3e68d1067a30791f54.zip
refactor CdxApiClient, add tests
- always use auth token and get full CDX rows - simplify to "fetch" (exact url/dt match) and "lookup_best" methods - all redirect stuff will be moved to a higher level
Diffstat (limited to 'python/tests')
-rw-r--r--python/tests/test_wayback.py110
1 files changed, 110 insertions, 0 deletions
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
new file mode 100644
index 0000000..7e63ec7
--- /dev/null
+++ b/python/tests/test_wayback.py
@@ -0,0 +1,110 @@
+
+import json
+import pytest
+import responses
+
+from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError
+
+
+CDX_TARGET = "http://fatcat.wiki/"
+CDX_DT = "20180812220054"
+# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
+CDX_SINGLE_HIT = [
+ ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+ ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+]
+
+CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR"
+# synthetic fixture: hand-edited variants of the single-hit row above (not literal output of a cdx query)
+CDX_MULTI_HIT = [
+ ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
+ ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # newer (more recent timestamp), but wrong mimetype
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # newer and right mimetype, but wrong status code
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # "best": the row test_cdx_lookup_best expects back (right mimetype, 200 status)
+ ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ # right mimetype and status, but older than the "best" row
+ ["wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+]
+
+@pytest.fixture
+def cdx_client():
+ client = CdxApiClient(
+ host_url="http://dummy-cdx/cdx",
+ )
+ return client
+
+@pytest.fixture
+def wayback_client(cdx_client):
+ client = WaybackClient(
+ cdx_client=cdx_client,
+ petabox_webdata_secret="dummy-petabox-secret",
+ )
+ return client
+
+@responses.activate
+def test_cdx_fetch(cdx_client):
+
+ responses.add(responses.GET,
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SINGLE_HIT))
+
+ resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
+
+ assert len(responses.calls) == 1
+
+ assert resp.datetime == CDX_DT
+ assert resp.url == CDX_TARGET
+ assert resp.sha1b32 == "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR"
+ assert resp.warc_csize == "8445"
+ assert resp.warc_offset == "108062304"
+ assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+
+@responses.activate
+def test_cdx_fetch_errors(cdx_client):
+
+ with pytest.raises(ValueError):
+ resp = cdx_client.fetch(CDX_TARGET, "2019")
+
+ responses.add(responses.GET,
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SINGLE_HIT))
+
+ with pytest.raises(KeyError):
+ resp = cdx_client.fetch(CDX_TARGET, "20180812220055")
+
+ with pytest.raises(KeyError):
+ resp = cdx_client.fetch("http://some-other.com", CDX_DT)
+
+ resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
+ assert len(responses.calls) == 3
+
+@responses.activate
+def test_cdx_lookup_best(cdx_client):
+
+ responses.add(responses.GET,
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_MULTI_HIT))
+
+ resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")
+
+ assert len(responses.calls) == 1
+
+ assert resp.datetime == CDX_DT
+ assert resp.url == CDX_TARGET
+ assert resp.sha1b32 == CDX_BEST_SHA1B32
+ assert resp.warc_path == CDX_SINGLE_HIT[1][-1]
+
+def test_wayback_fetch(wayback_client, mocker):
+ # TODO: placeholder; nothing is mocked or asserted here yet
+ #mocker.patch('fatcat_tools.harvest.harvest_common.HarvestState.initialize_from_kafka')
+ #blah = mocker.Mock()
+ return
+