about | summary | refs | log | tree | commit | diff | stats
path: root/python/tests/test_wayback.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/tests/test_wayback.py')
-rw-r--r-- python/tests/test_wayback.py 100
1 files changed, 73 insertions, 27 deletions
diff --git a/python/tests/test_wayback.py b/python/tests/test_wayback.py
index 83311b9..6ccf775 100644
--- a/python/tests/test_wayback.py
+++ b/python/tests/test_wayback.py
@@ -1,4 +1,3 @@
-
import json
import pytest
@@ -10,27 +9,66 @@ CDX_TARGET = "http://fatcat.wiki/"
CDX_DT = "20180812220054"
# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_SINGLE_HIT = [
- ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ [
+ "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect",
+ "robotflags", "length", "offset", "filename"
+ ],
+ [
+ "wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
]
CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR"
# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_MULTI_HIT = [
- ["urlkey","timestamp","original","mimetype","statuscode","digest","redirect","robotflags","length","offset","filename"],
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # sooner, but not right mimetype
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # sooner and mimetype, but wrong status code
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- ["wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # "best"
- ["wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
- # older
- ["wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200", "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304", "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"],
+ [
+ "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest", "redirect",
+ "robotflags", "length", "offset", "filename"
+ ],
+ [
+ "wiki,fatcat)/", CDX_DT, CDX_TARGET, "text/html", "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
+ # sooner, but not right mimetype
+ [
+ "wiki,fatcat)/", "20180912220054", CDX_TARGET, "text/html", "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
+ # sooner and mimetype, but wrong status code
+ [
+ "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "400",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
+ [
+ "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "500",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
+ [
+ "wiki,fatcat)/", "20180912220054", CDX_TARGET, "application/pdf", "150",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
+ # "best"
+ [
+ "wiki,fatcat)/", CDX_DT, CDX_TARGET, "application/pdf", "200", CDX_BEST_SHA1B32, "-",
+ "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
+ # older
+ [
+ "wiki,fatcat)/", "20180712220054", CDX_TARGET, "application/pdf", "200",
+ "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR", "-", "-", "8445", "108062304",
+ "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+ ],
]
+
@pytest.fixture
def cdx_client():
client = CdxApiClient(
@@ -39,13 +77,14 @@ def cdx_client():
)
return client
+
@responses.activate
def test_cdx_fetch(cdx_client):
responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SINGLE_HIT))
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SINGLE_HIT))
resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
@@ -58,6 +97,7 @@ def test_cdx_fetch(cdx_client):
assert resp.warc_offset == 108062304
assert resp.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
+
@responses.activate
def test_cdx_fetch_errors(cdx_client):
@@ -65,9 +105,9 @@ def test_cdx_fetch_errors(cdx_client):
resp = cdx_client.fetch(CDX_TARGET, "2019")
responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_SINGLE_HIT))
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_SINGLE_HIT))
with pytest.raises(KeyError):
resp = cdx_client.fetch(CDX_TARGET, "20180812220055")
@@ -78,13 +118,14 @@ def test_cdx_fetch_errors(cdx_client):
resp = cdx_client.fetch(CDX_TARGET, CDX_DT)
assert len(responses.calls) == 3
+
@responses.activate
def test_cdx_lookup_best(cdx_client):
responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_MULTI_HIT))
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_MULTI_HIT))
resp = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")
@@ -95,6 +136,7 @@ def test_cdx_lookup_best(cdx_client):
assert resp.sha1b32 == CDX_BEST_SHA1B32
assert resp.warc_path == CDX_SINGLE_HIT[1][-1]
+
WARC_TARGET = "http://fatcat.wiki/"
WARC_BODY = b"""
<html>
@@ -108,6 +150,7 @@ WARC_BODY = b"""
</html>
"""
+
@pytest.fixture
def wayback_client(cdx_client, mocker):
client = WaybackClient(
@@ -127,6 +170,7 @@ def wayback_client(cdx_client, mocker):
return client
+
@pytest.fixture
def wayback_client_pdf(cdx_client, mocker):
@@ -150,6 +194,7 @@ def wayback_client_pdf(cdx_client, mocker):
return client
+
@responses.activate
def test_wayback_fetch(wayback_client):
resp = wayback_client.fetch_petabox(123, 456789, "here/there.warc.gz")
@@ -159,13 +204,14 @@ def test_wayback_fetch(wayback_client):
resp = wayback_client.fetch_petabox_body(123, 456789, "here/there.warc.gz")
assert resp == WARC_BODY
+
@responses.activate
def test_lookup_resource_success(wayback_client):
responses.add(responses.GET,
- 'http://dummy-cdx/cdx',
- status=200,
- body=json.dumps(CDX_MULTI_HIT))
+ 'http://dummy-cdx/cdx',
+ status=200,
+ body=json.dumps(CDX_MULTI_HIT))
resp = wayback_client.lookup_resource(CDX_TARGET)