diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-14 16:06:19 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-14 16:06:19 -0800 |
commit | 4bb341270907f91b0475a7cdb00a7d280a80c06c (patch) | |
tree | cb983438f7ff43179831b605f8fb8704aa7d2860 /python | |
parent | ba6f16a02cfde0e4acb499c00b456b42472c0b00 (diff) | |
download | sandcrawler-4bb341270907f91b0475a7cdb00a7d280a80c06c.tar.gz sandcrawler-4bb341270907f91b0475a7cdb00a7d280a80c06c.zip |
SPNv2 doesn't support FTP; add a live test for non-revisit FTP
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 10 | ||||
-rw-r--r-- | python/tests/test_live_wayback.py | 16 |
2 files changed, 26 insertions, 0 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 02e71be..b71a20d 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -643,6 +643,16 @@ class SavePageNowClient: """ if not (self.ia_access_key and self.ia_secret_key): raise Exception("SPN2 requires authentication (IA_ACCESS_KEY/IA_SECRET_KEY)") + if request_url.startswith("ftp://"): + return SavePageNowResult( + False, + "spn2-no-ftp", + None, + request_url, + None, + None, + None, + ) resp = self.v2_session.post( self.v2endpoint, data={ diff --git a/python/tests/test_live_wayback.py b/python/tests/test_live_wayback.py index f15c63e..4f7daef 100644 --- a/python/tests/test_live_wayback.py +++ b/python/tests/test_live_wayback.py @@ -121,13 +121,29 @@ def test_cdx_fetch_spn2(cdx_client): def test_lookup_ftp(wayback_client): # ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/80/23/10.1177_1559827617708562.PMC6236633.pdf # ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf + # ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf + # revisit! url = "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/ad/ab/mmr-17-05-6969.PMC5928650.pdf" resp = wayback_client.lookup_resource(url) assert resp.hit == True assert resp.status == "success" assert resp.terminal_url == url + assert resp.terminal_status_code == 226 + assert resp.cdx.url == url + + file_meta = gen_file_metadata(resp.body) + assert file_meta['sha1hex'] == resp.cdx.sha1hex + + # not revisit? + url = "ftp://ftp.cs.utexas.edu/pub/qsim/papers/Xu-crv-08.pdf" + resp = wayback_client.lookup_resource(url) + + assert resp.hit == True + assert resp.status == "success" + assert resp.terminal_url == url + assert resp.terminal_status_code == 226 assert resp.cdx.url == url file_meta = gen_file_metadata(resp.body) |