aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-01-09 16:51:59 -0800
committerBryan Newbold <bnewbold@archive.org>2020-01-09 16:51:59 -0800
commit00cf33a1c230c8ce5dcda41aba5dcc6a88264d46 (patch)
tree2b25cf54ae3e37063771862811ce4c07c8300140
parent56d33e58a6c385a4bffb6d27992b6708da55c360 (diff)
downloadsandcrawler-00cf33a1c230c8ce5dcda41aba5dcc6a88264d46.tar.gz
sandcrawler-00cf33a1c230c8ce5dcda41aba5dcc6a88264d46.zip
add (skipped) live tests for wayback services
-rw-r--r--python/tests/test_live_wayback.py73
1 files changed, 73 insertions, 0 deletions
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
new file mode 100644
index 0000000..7b9a1f2
--- /dev/null
+++ b/python/tests/test_live_wayback.py
@@ -0,0 +1,73 @@
+
+"""
+This file contains tests to run against "live" wayback services. They default
+to "skip" because you need authentication, and we shouldn't hit these services
+automatically in CI.
+
+Simply uncomment lines to run.
+"""
+
+import json
+import pytest
+
+from sandcrawler import CdxApiClient, CdxApiError, WaybackClient, WaybackError, PetaboxError
+
+
+@pytest.fixture
+def cdx_client():
+ client = CdxApiClient()
+ return client
+
+@pytest.fixture
+def wayback_client():
+ client = WaybackClient()
+ return client
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_cdx_fetch(cdx_client):
+
+ # org,plos,journals)/plosone/article?id=10.1371/journal.pone.0093949 20181105121428 https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0093949 text/html 200 OJ6FN5AAPU62VMMVJPXZYNBQD5VMYHFV - - 25338 240665973 MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz
+
+ url = "https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0093949"
+ datetime = "20181105121428"
+ resp = cdx_client.fetch(url, datetime)
+
+ assert resp.url == url
+ assert resp.datetime == datetime
+ assert resp.sha1b32 == "OJ6FN5AAPU62VMMVJPXZYNBQD5VMYHFV"
+ assert resp.warc_csize == 25338
+ assert resp.warc_offset == 240665973
+ assert resp.warc_path == "MEDIACLOUD-20181105115107-crawl851/MEDIACLOUD-20181105115107-09234.warc.gz"
+
+ # bogus datetime; shouldn't match
+ with pytest.raises(KeyError):
+ resp = cdx_client.fetch(url, "12345678123456")
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_cdx_lookup_best(cdx_client):
+
+ url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable"
+ resp = cdx_client.lookup_best(url, best_mimetype="application/pdf")
+
+ # won't know datetime, hash, etc
+ assert resp.url in (url, url.replace("https://", "http://"))
+ assert resp.mimetype == "application/pdf"
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_wayback_fetch(wayback_client):
+
+ resp = wayback_client.fetch_petabox(25683, 2676464871, "archiveteam_archivebot_go_20171205210002/arstechnica.co.uk-inf-20171201-061309-bb65j-00021.warc.gz")
+
+ assert resp.body
+
+@pytest.mark.skip(reason="hits prod services, requires auth")
+def test_lookup_resource_success(wayback_client):
+
+ url = "https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0093949&type=printable"
+ resp = wayback_client.lookup_resource(url)
+
+ assert resp.hit == True
+ assert resp.status == "success"
+ assert resp.terminal_url in (url, url.replace("https://", "http://"))
+ assert resp.cdx.url in (url, url.replace("https://", "http://"))
+