about summary refs log tree commit diff stats
path: root/python/sandcrawler
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-07-14 15:00:26 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-07-14 15:00:26 -0700
commit b680c255508e6721185c6793bc872c0dc97864a0 (patch)
tree c27dfc07fe39b9a4be79ff8b17da48d7d4e7009b /python/sandcrawler
parent 2114cce448c5ff0424f667ba5298010722965d73 (diff)
download sandcrawler-b680c255508e6721185c6793bc872c0dc97864a0.tar.gz
sandcrawler-b680c255508e6721185c6793bc872c0dc97864a0.zip
ingest: handle another type of wayback redirect
Diffstat (limited to 'python/sandcrawler')
-rw-r--r-- python/sandcrawler/ia.py 7
1 files changed, 5 insertions, 2 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 8462da1..7b9427e 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -699,13 +699,16 @@ class WaybackClient:
except Exception as e:
if resp is not None and "X-Archive-Src" in resp.headers:
raise WaybackContentError(
- f"expected redirect record but got HTTP status {resp.status_code}"
+ f"expected redirect record but got captured HTTP status: {resp.status_code}"
)
raise WaybackError(str(e))
# defensively check that this is actually correct replay based on headers
# previously check for "X-Archive-Redirect-Reason" here
- if "X-Archive-Src" not in resp.headers:
+ if (
+ "X-Archive-Src" not in resp.headers
+ and "X-Archive-Redirect-Reason" not in resp.headers
+ ):
raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers")
if datetime not in resp.url:
raise WaybackError(