python/tests/test_live_wayback.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181

"""
This file contains tests to run against "live" wayback services. They default
to "skip" because you need authentication, and we shouldn't hit these services
automatically in CI.

Simply uncomment lines to run.
"""

import pytest

from sandcrawler import CdxApiClient, SavePageNowClient, WaybackClient, gen_file_metadata


@pytest.fixture
def cdx_client():
    client = CdxApiClient()
    return client


@pytest.fixture
def wayback_client():
    client = WaybackClient()
    return client


@pytest.fixture
def spn_client():
    client = SavePageNowClient()
    return client


@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_fetch(cdx_client):

    # org,plos,journals)/plosone/article?id=10.1371/journal.pone.0093949 20181105121428 https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0093949 text/html 200 OJ6FN5AAPU62VMMVJPXZYNBQD5VMYHFV - - 25338 240665973 MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz

    url = "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0093949"
    datetime = "20181105121428"
    resp = cdx_client.fetch(url, datetime)

    assert resp.url == url
    assert resp.datetime == datetime
    assert resp.sha1b32 == "OJ6FN5AAPU62VMMVJPXZYNBQD5VMYHFV"
    assert resp.warc_csize == 25338
    assert resp.warc_offset == 240665973
    assert (
        resp.warc_path
        == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz"
    )

    # bogus datetime; shouldn't match
    with pytest.raises(KeyError):
        resp = cdx_client.fetch(url, "12345678123456")


@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_lookup_best(cdx_client):

    url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable"
    resp = cdx_client.lookup_best(url, best_mimetype="application/pdf")

    # won't know datetime, hash, etc
    assert resp.url in (url, url.replace("https://", "http://"))
    assert resp.mimetype == "application/pdf"
    assert resp.status_code == 200

    url = "https://americanarchivist.org/doi/abs/10.17723/aarc.62.2.gu33570g87v71007"
    resp = cdx_client.lookup_best(url, best_mimetype="application/pdf")

    assert resp.url in (url, url.replace("https://", "http://"))
    assert resp.mimetype == "text/html"
    assert resp.status_code == 200


@pytest.mark.skip(reason="hits prod services, requires auth")
def test_wayback_fetch(wayback_client):

    resp = wayback_client.fetch_petabox(
        25683,
        2676464871,
        "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz",
    )

    assert resp.body


@pytest.mark.skip(reason="hits prod services, requires auth")
def test_lookup_resource_success(wayback_client):

    url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable"
    resp = wayback_client.lookup_resource(url)

    assert resp.hit is True
    assert resp.status == "success"
    assert resp.terminal_url in (url, url.replace("https://", "http://"))
    assert resp.cdx.url in (url, url.replace("https://", "http://"))


@pytest.mark.skip(reason="hits prod services, requires auth")
def test_cdx_fetch_spn2(cdx_client):

    # https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424 20200110210133

    # com,elsevier,linkinghub)/retrieve/pii/s2590109519300424 20191201203206 https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424 text/html 200 FPXVUJR7RXVGO6RIY5HYB6JVT7OD53SG - - 5026 364192270 liveweb-20191201204645/live-20191201195942-wwwb-app52.us.archive.org.warc.gz
    # com,elsevier,linkinghub)/retrieve/pii/s2590109519300424 20200110210044 https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424 text/html 200 OIQ3TKPBQLYYXQDIG7D2ZOK7IJEUEAQ7 - - 5130 710652442 liveweb-20200110204521-wwwb-spn20.us.archive.org-8001.warc.gz
    # com,elsevier,linkinghub)/retrieve/pii/s2590109519300424 20200110210133 https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424 text/html 200 G2MSFAYELECMFGKTYEHUN66WWNW4HXKQ - - 5126 544508422 liveweb-20200110205247-wwwb-spn01.us.archive.org-8000.warc.gz

    url = "https://linkinghub.elsevier.com/retrieve/pii/S2590109519300424"
    datetime = "20200110210133"
    resp = cdx_client.fetch(url, datetime, filter_status_code=200)

    assert resp.url == url
    assert resp.datetime == datetime
    assert resp.sha1b32 == "G2MSFAYELECMFGKTYEHUN66WWNW4HXKQ"
    assert resp.status_code == 200

    # https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 20200110222410

    # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
    # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz
    # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz

    url = "https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209"
    datetime = "20200110222410"
    resp = cdx_client.fetch(url, datetime, filter_status_code=200)

    assert resp.url == url
    assert resp.datetime == datetime
    assert resp.sha1b32 == "VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL"
    assert resp.status_code == 200


@pytest.mark.skip(reason="hits prod services, requires auth")
def test_lookup_ftp(wayback_client):
    # ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/80/23/10.1177_1559827617708562.PMC6236633.pdf
    # ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf
    # ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf

    # revisit!
    url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf"
    resp = wayback_client.lookup_resource(url)

    assert resp.hit is True
    assert resp.status == "success"
    assert resp.terminal_url == url
    assert resp.terminal_status_code in (226, 200)
    assert resp.cdx.url == url
    assert resp.revisit_cdx
    assert resp.revisit_cdx.url != url

    file_meta = gen_file_metadata(resp.body)
    assert file_meta["sha1hex"] == resp.cdx.sha1hex

    # not revisit?
    url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf"
    resp = wayback_client.lookup_resource(url)

    assert resp.hit is True
    assert resp.status == "success"
    assert resp.terminal_url == url
    assert resp.terminal_status_code in (226, 200)
    assert resp.cdx.url == url

    file_meta = gen_file_metadata(resp.body)
    assert file_meta["sha1hex"] == resp.cdx.sha1hex


@pytest.mark.skip(reason="hits prod services, requires auth")
def test_crawl_ftp(spn_client, wayback_client):

    url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf"
    resp = spn_client.crawl_resource(url, wayback_client)

    # FTP isn't supported yet!
    # assert resp.hit is True
    # assert resp.status == "success"
    # assert resp.terminal_url == url
    # assert resp.cdx.url == url

    assert resp.hit is False
    assert resp.status == "spn2-no-ftp"