aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-14 16:06:19 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-14 16:06:19 -0800
commit 4bb341270907f91b0475a7cdb00a7d280a80c06c (patch)
tree cb983438f7ff43179831b605f8fb8704aa7d2860 /python
parent ba6f16a02cfde0e4acb499c00b456b42472c0b00 (diff)
download sandcrawler-4bb341270907f91b0475a7cdb00a7d280a80c06c.tar.gz
sandcrawler-4bb341270907f91b0475a7cdb00a7d280a80c06c.zip
SPNv2 doesn't support FTP; add a live test for non-revisit FTP
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ia.py 10
-rw-r--r-- python/tests/test_live_wayback.py 16
2 files changed, 26 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 02e71be..b71a20d 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -643,6 +643,16 @@ class SavePageNowClient:
"""
if not (self.ia_access_key and self.ia_secret_key):
raise Exception("SPN2 requires authentication (IA_ACCESS_KEY/IA_SECRET_KEY)")
+ if request_url.startswith("ftp://"):
+ return SavePageNowResult(
+ False,
+ "spn2-no-ftp",
+ None,
+ request_url,
+ None,
+ None,
+ None,
+ )
resp = self.v2_session.post(
self.v2endpoint,
data={
diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py
index f15c63e..4f7daef 100644
--- a/python/tests/test_live_wayback.py
+++ b/python/tests/test_live_wayback.py
@@ -121,13 +121,29 @@ def test_cdx_fetch_spn2(cdx_client):
def test_lookup_ftp(wayback_client):
# ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/80/23/10.1177_1559827617708562.PMC6236633.pdf
# ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf
+ # ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf
+ # revisit!
url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf"
resp = wayback_client.lookup_resource(url)
assert resp.hit == True
assert resp.status == "success"
assert resp.terminal_url == url
+ assert resp.terminal_status_code == 226
+ assert resp.cdx.url == url
+
+ file_meta = gen_file_metadata(resp.body)
+ assert file_meta['sha1hex'] == resp.cdx.sha1hex
+
+ # not revisit?
+ url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf"
+ resp = wayback_client.lookup_resource(url)
+
+ assert resp.hit == True
+ assert resp.status == "success"
+ assert resp.terminal_url == url
+ assert resp.terminal_status_code == 226
assert resp.cdx.url == url
file_meta = gen_file_metadata(resp.body)