1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
import json
import pytest
import responses
from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError
# Canned CDX API data shared by the tests below.
CDX_TARGET = "http://fatcat.wiki/"
CDX_DT = "20180812220054"
CDX_BEST_SHA1B32 = "AAAAAAAAASIHDJIEP7ZW53DLRX5NFIJR"

# Column names of the header row in a CDX JSON response.
_CDX_FIELDS = (
    "urlkey", "timestamp", "original", "mimetype", "statuscode", "digest",
    "redirect", "robotflags", "length", "offset", "filename",
)
_CDX_DIGEST = "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR"
_CDX_WARC_PATH = "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"


def _cdx_row(timestamp, mimetype, statuscode, digest=_CDX_DIGEST):
    """Build one CDX result row for CDX_TARGET.

    Only the columns that vary between the fixture rows are parameters;
    every other column is the same constant in all rows.
    """
    return [
        "wiki,fatcat)/", timestamp, CDX_TARGET, mimetype, statuscode, digest,
        "-", "-", "8445", "108062304", _CDX_WARC_PATH,
    ]


# cdx -m exact -p output=json -p from=20180812220054 -p to=20180812220054 http://fatcat.wiki/
CDX_SINGLE_HIT = [
    list(_CDX_FIELDS),
    _cdx_row(CDX_DT, "text/html", "200"),
]

# Variations on the single-hit row (timestamp / mimetype / status / digest)
# used to exercise "best capture" selection.
CDX_MULTI_HIT = [
    list(_CDX_FIELDS),
    _cdx_row(CDX_DT, "text/html", "200"),
    # later timestamp, but not the right mimetype
    _cdx_row("20180912220054", "text/html", "200"),
    # later timestamp and right mimetype, but wrong status code
    _cdx_row("20180912220054", "application/pdf", "400"),
    _cdx_row("20180912220054", "application/pdf", "500"),
    _cdx_row("20180912220054", "application/pdf", "150"),
    # "best"
    _cdx_row(CDX_DT, "application/pdf", "200", CDX_BEST_SHA1B32),
    # older
    _cdx_row("20180712220054", "application/pdf", "200"),
]
@pytest.fixture
def cdx_client():
    """Provide a CdxApiClient pointed at the dummy CDX endpoint mocked by the tests."""
    return CdxApiClient(
        host_url="http://dummy-cdx/cdx",
        cdx_auth_token="dummy-token",
    )
@responses.activate
def test_cdx_fetch(cdx_client):
    """fetch() issues one request and exposes the single canned row's fields."""
    responses.add(
        responses.GET,
        'http://dummy-cdx/cdx',
        status=200,
        body=json.dumps(CDX_SINGLE_HIT),
    )

    hit = cdx_client.fetch(CDX_TARGET, CDX_DT)

    # exactly one HTTP round-trip to the mocked endpoint
    assert len(responses.calls) == 1

    # fields of the returned object mirror the columns of the canned row
    assert hit.datetime == CDX_DT
    assert hit.url == CDX_TARGET
    assert hit.sha1b32 == "O5RHV6OQ7SIHDJIEP7ZW53DLRX5NFIJR"
    assert hit.warc_csize == "8445"
    assert hit.warc_offset == "108062304"
    assert hit.warc_path == "WIDE-20180810142205-crawl802/WIDE-20180812131623-00059.warc.gz"
@responses.activate
def test_cdx_fetch_errors(cdx_client):
    """fetch() raises ValueError for a short datetime and KeyError on mismatches.

    The final call count of 3 (for four fetch() calls) implies the call
    rejected with ValueError never reached the mocked endpoint.
    """
    # a 4-character datetime is rejected; no mock response is registered yet
    with pytest.raises(ValueError):
        cdx_client.fetch(CDX_TARGET, "2019")

    responses.add(
        responses.GET,
        'http://dummy-cdx/cdx',
        status=200,
        body=json.dumps(CDX_SINGLE_HIT),
    )

    # canned row has a different timestamp than the one requested
    with pytest.raises(KeyError):
        cdx_client.fetch(CDX_TARGET, "20180812220055")

    # canned row has a different URL than the one requested
    with pytest.raises(KeyError):
        cdx_client.fetch("http://some-other.com", CDX_DT)

    # exact match succeeds
    cdx_client.fetch(CDX_TARGET, CDX_DT)

    assert len(responses.calls) == 3
@responses.activate
def test_cdx_lookup_best(cdx_client):
    """lookup_best() picks the row marked "best" in CDX_MULTI_HIT for a PDF lookup."""
    responses.add(
        responses.GET,
        'http://dummy-cdx/cdx',
        status=200,
        body=json.dumps(CDX_MULTI_HIT),
    )

    best = cdx_client.lookup_best(CDX_TARGET, best_mimetype="application/pdf")

    assert len(responses.calls) == 1

    # the distinctive digest identifies which of the candidate rows was chosen
    assert best.sha1b32 == CDX_BEST_SHA1B32
    assert best.datetime == CDX_DT
    assert best.url == CDX_TARGET
    assert best.warc_path == CDX_SINGLE_HIT[1][-1]
# Value (wrapped in a list) returned by the mocked WARC resource's get_location().
WARC_TARGET = "http://fatcat.wiki/"
# Bytes returned by read() on the mocked WARC record body.
WARC_BODY = b"<html>some stuff</html>"
@pytest.fixture
def wayback_client(cdx_client, mocker):
    """Provide a WaybackClient whose `rstore` is replaced by mocks.

    The mocked store's load_resource() returns a resource reporting status
    [200], location [WARC_TARGET], and a raw body that reads as WARC_BODY.
    """
    client = WaybackClient(
        cdx_client=cdx_client,
        petabox_webdata_secret="dummy-petabox-secret",
    )

    # innermost first: the file-like object handed out by open_raw_content()
    raw_content = mocker.Mock()
    raw_content.read = mocker.MagicMock(return_value=WARC_BODY)

    # the resource returned by the store
    warc_resource = mocker.Mock()
    warc_resource.get_status = mocker.MagicMock(return_value=[200])
    warc_resource.get_location = mocker.MagicMock(return_value=[WARC_TARGET])
    warc_resource.open_raw_content = mocker.MagicMock(return_value=raw_content)

    # mock out the wayback store itself
    store = mocker.Mock()
    store.load_resource = mocker.MagicMock(return_value=warc_resource)
    client.rstore = store

    return client
def test_wayback_fetch(wayback_client, mocker):
    """fetch_petabox() and fetch_petabox_body() surface the mocked WARC content."""
    warc_args = (123, 456789, "here/there.warc.gz")

    full = wayback_client.fetch_petabox(*warc_args)
    assert full.body == WARC_BODY
    assert full.location == WARC_TARGET

    body_only = wayback_client.fetch_petabox_body(*warc_args)
    assert body_only == WARC_BODY
|