From b680c255508e6721185c6793bc872c0dc97864a0 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 14 Jul 2022 15:00:26 -0700 Subject: ingest: handle another type of wayback redirect --- python/sandcrawler/ia.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'python/sandcrawler') diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 8462da1..7b9427e 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -699,13 +699,16 @@ class WaybackClient: except Exception as e: if resp is not None and "X-Archive-Src" in resp.headers: raise WaybackContentError( - f"expected redirect record but got HTTP status {resp.status_code}" + f"expected redirect record but got captured HTTP status: {resp.status_code}" ) raise WaybackError(str(e)) # defensively check that this is actually correct replay based on headers # previously check for "X-Archive-Redirect-Reason" here - if "X-Archive-Src" not in resp.headers: + if ( + "X-Archive-Src" not in resp.headers + and "X-Archive-Redirect-Reason" not in resp.headers + ): raise WaybackError("redirect replay fetch didn't return X-Archive-Src in headers") if datetime not in resp.url: raise WaybackError( -- cgit v1.2.3