From fe18ff4478de8481b732dce1408a39b1d3c2795d Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 15 Jul 2022 14:31:52 -0700 Subject: wayback: use same 5xx/4xx-allowing tricks for replay body fetch as for replay redirect --- python/sandcrawler/ia.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 227f7d0..e08031e 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -617,7 +617,7 @@ class WaybackClient: assert datetime.isdigit() try: - resp = self.http_session.get( + resp = self.record_http_session.get( self.wayback_endpoint + datetime + "id_/" + url, allow_redirects=False, headers=self.replay_headers, @@ -635,14 +635,14 @@ class WaybackClient: ) ) - try: - resp.raise_for_status() - except Exception as e: - raise WaybackError(str(e)) - # print(resp.url, file=sys.stderr) - # defensively check that this is actually correct replay based on headers if "X-Archive-Src" not in resp.headers: + # check if this was an error first + try: + resp.raise_for_status() + except Exception as e: + raise WaybackError(str(e)) + # otherwise, a weird case (200/redirect but no Src header) raise WaybackError("replay fetch didn't return X-Archive-Src in headers") if datetime not in resp.url: raise WaybackError( -- cgit v1.2.3