about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-01-21 11:23:14 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-01-21 11:23:14 -0800
commit 9affaecc956854621362dfa70cc78b2602a01402 (patch)
tree a77119e9cf03d82bada23bde5730b0195344514c
parent a1b44161e206873be30c0640f5fab7a284023ba1 (diff)
download sandcrawler-9affaecc956854621362dfa70cc78b2602a01402.tar.gz
sandcrawler-9affaecc956854621362dfa70cc78b2602a01402.zip
wayback: replay redirects have X-Archive-Redirect-Reason
-rw-r--r-- python/sandcrawler/ia.py 6
1 files changed, 4 insertions, 2 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 02258df..b5883e5 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -432,7 +432,8 @@ class WaybackClient:
(web.archive.org) instead of petabox.
Intended for use with SPN2 requests, where request body has not ended
- up in petabox yet.
+ up in petabox yet. For example, re-ingesting a base_url which was
+ recently crawled by SPNv2, where we are doing ingest via wayback path.
Returns None if response is found, but couldn't find redirect.
"""
@@ -457,7 +458,8 @@ class WaybackClient:
#print(resp.url, file=sys.stderr)
# defensively check that this is actually correct replay based on headers
- assert "X-Archive-Src" in resp.headers
+ if not "X-Archive-Redirect-Reason" in resp.headers:
+ raise WaybackError("redirect replay fetch didn't return X-Archive-Redirect-Reason in headers")
if not datetime in resp.url:
raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url))