diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-14 15:00:26 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-14 15:00:26 -0700 |
commit | b680c255508e6721185c6793bc872c0dc97864a0 (patch) | |
tree | c27dfc07fe39b9a4be79ff8b17da48d7d4e7009b /python | |
parent | 2114cce448c5ff0424f667ba5298010722965d73 (diff) | |
download | sandcrawler-b680c255508e6721185c6793bc872c0dc97864a0.tar.gz sandcrawler-b680c255508e6721185c6793bc872c0dc97864a0.zip |
ingest: handle another type of wayback redirect
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 7 |
1 files changed, 5 insertions, 2 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 8462da1..7b9427e 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -699,13 +699,16 @@ class WaybackClient:
         except Exception as e:
             if resp is not None and "X-Archive-Src" in resp.headers:
                 raise WaybackContentError(
-                    f"expected redirect record but got HTTP status {resp.status_code}"
+                    f"expected redirect record but got captured HTTP status: {resp.status_code}"
                 )
             raise WaybackError(str(e))
 
         # defensively check that this is actually correct replay based on headers
         # previously check for "X-Archive-Redirect-Reason" here
-        if "X-Archive-Src" not in resp.headers:
+        if (
+            "X-Archive-Src" not in resp.headers
+            and "X-Archive-Redirect-Reason" not in resp.headers
+        ):
             raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers")
         if datetime not in resp.url:
             raise WaybackError(