diff options
Diffstat (limited to 'python/tests/test_live_wayback.py')
-rw-r--r-- | python/tests/test_live_wayback.py | 54 |
1 files changed, 34 insertions, 20 deletions
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py index 429c6b0..9bd8b5f 100644 --- a/python/tests/test_live_wayback.py +++ b/python/tests/test_live_wayback.py @@ -1,4 +1,3 @@ - """ This file contains tests to run against "live" wayback services. They default to "skip" because you need authentication, and we shouldn't hit these services @@ -7,10 +6,9 @@ automatically in CI. Simply uncomment lines to run. """ -import json import pytest -from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError, SavePageNowClient, SavePageNowError, CdxPartial, gen_file_metadata +from sandcrawler import CdxApiClient, SavePageNowClient, WaybackClient, gen_file_metadata @pytest.fixture @@ -18,16 +16,19 @@ def cdx_client(): client = CdxApiClient() return client + @pytest.fixture def wayback_client(): client = WaybackClient() return client + @pytest.fixture def spn_client(): client = SavePageNowClient() return client + @pytest.mark.skip(reason="hits prod services, requires auth") def test_cdx_fetch(cdx_client): @@ -42,12 +43,16 @@ def test_cdx_fetch(cdx_client): assert resp.sha1b32 == "OJ6FN5AAPU62VMMVJPXZYNBQD5VMYHFV" assert resp.warc_csize == 25338 assert resp.warc_offset == 240665973 - assert resp.warc_path == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz" + assert ( + resp.warc_path + == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz" + ) # bogus datetime; shouldn't match with pytest.raises(KeyError): resp = cdx_client.fetch(url, "12345678123456") + @pytest.mark.skip(reason="hits prod services, requires auth") def test_cdx_lookup_best(cdx_client): @@ -66,24 +71,31 @@ def test_cdx_lookup_best(cdx_client): assert resp.mimetype == "text/html" assert resp.status_code == 200 + @pytest.mark.skip(reason="hits prod services, requires auth") def test_wayback_fetch(wayback_client): - resp = wayback_client.fetch_petabox(25683, 2676464871, "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz") + resp = wayback_client.fetch_petabox( + 25683, + 2676464871, + "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz", + ) assert resp.body + @pytest.mark.skip(reason="hits prod services, requires auth") def test_lookup_resource_success(wayback_client): url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable" resp = wayback_client.lookup_resource(url) - assert resp.hit == True + assert resp.hit is True assert resp.status == "success" assert resp.terminal_url in (url, url.replace("https://", "http://")) assert resp.cdx.url in (url, url.replace("https://", "http://")) + @pytest.mark.skip(reason="hits prod services, requires auth") def test_cdx_fetch_spn2(cdx_client): @@ -104,9 +116,9 @@ def test_cdx_fetch_spn2(cdx_client): # https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 20200110222410 - #com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz -#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz -#com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz + # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 200 VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL - - 9006 815069841 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz + # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222410 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1096 815066572 liveweb-20200110214015-wwwb-spn18.us.archive.org-8002.warc.gz + # com,wiley,onlinelibrary)/doi/pdf/10.1002/lrh2.10209 20200110222422 https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209 text/html 302 AFI55BZE23HDTTEERUFKRP6WQVO3LOLS - - 1094 307563475 liveweb-20200110214449-wwwb-spn18.us.archive.org-8003.warc.gz url = "https://onlinelibrary.wiley.com/doi/pdf/10.1002/lrh2.10209" datetime = "20200110222410" @@ -117,6 +129,7 @@ def test_cdx_fetch_spn2(cdx_client): assert resp.sha1b32 == "VYW7JXFK6EC2KC537N5B7PHYZC4B6MZL" assert resp.status_code == 200 + @pytest.mark.skip(reason="hits prod services, requires auth") def test_lookup_ftp(wayback_client): # ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/80/23/10.1177_1559827617708562.PMC6236633.pdf @@ -127,29 +140,30 @@ def test_lookup_ftp(wayback_client): url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf" resp = wayback_client.lookup_resource(url) - assert resp.hit == True + assert resp.hit is True assert resp.status == "success" assert resp.terminal_url == url - assert resp.terminal_status_code == 226 + assert resp.terminal_status_code in (226, 200) assert resp.cdx.url == url assert resp.revisit_cdx assert resp.revisit_cdx.url != url file_meta = gen_file_metadata(resp.body) - assert file_meta['sha1hex'] == resp.cdx.sha1hex + assert file_meta["sha1hex"] == resp.cdx.sha1hex # not revisit? url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf" resp = wayback_client.lookup_resource(url) - assert resp.hit == True + assert resp.hit is True assert resp.status == "success" assert resp.terminal_url == url - assert resp.terminal_status_code == 226 + assert resp.terminal_status_code in (226, 200) assert resp.cdx.url == url file_meta = gen_file_metadata(resp.body) - assert file_meta['sha1hex'] == resp.cdx.sha1hex + assert file_meta["sha1hex"] == resp.cdx.sha1hex + @pytest.mark.skip(reason="hits prod services, requires auth") def test_crawl_ftp(spn_client, wayback_client): @@ -158,10 +172,10 @@ def test_crawl_ftp(spn_client, wayback_client): resp = spn_client.crawl_resource(url, wayback_client) # FTP isn't supported yet! - #assert resp.hit == True - #assert resp.status == "success" - #assert resp.terminal_url == url - #assert resp.cdx.url == url + # assert resp.hit is True + # assert resp.status == "success" + # assert resp.terminal_url == url + # assert resp.cdx.url == url - assert resp.hit == False + assert resp.hit is False assert resp.status == "spn2-no-ftp" |