diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-15 14:31:52 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-15 14:31:52 -0700 |
commit | fe18ff4478de8481b732dce1408a39b1d3c2795d (patch) | |
tree | 9396025636ede4e76dc0aa509a120ae621d08f31 /python | |
parent | 0c0f9714724e65c0b12ac9c76132c6ab1590e823 (diff) | |
download | sandcrawler-fe18ff4478de8481b732dce1408a39b1d3c2795d.tar.gz sandcrawler-fe18ff4478de8481b732dce1408a39b1d3c2795d.zip |
wayback: use same 5xx/4xx-allowing tricks for replay body fetch as for replay redirect
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 14 |
1 files changed, 7 insertions, 7 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 227f7d0..e08031e 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -617,7 +617,7 @@ class WaybackClient:
         assert datetime.isdigit()
 
         try:
-            resp = self.http_session.get(
+            resp = self.record_http_session.get(
                 self.wayback_endpoint + datetime + "id_/" + url,
                 allow_redirects=False,
                 headers=self.replay_headers,
@@ -635,14 +635,14 @@ class WaybackClient:
                 )
             )
 
-        try:
-            resp.raise_for_status()
-        except Exception as e:
-            raise WaybackError(str(e))
-        # print(resp.url, file=sys.stderr)
-
         # defensively check that this is actually correct replay based on headers
         if "X-Archive-Src" not in resp.headers:
+            # check if this was an error first
+            try:
+                resp.raise_for_status()
+            except Exception as e:
+                raise WaybackError(str(e))
+            # otherwise, a weird case (200/redirect but no Src header
             raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
         if datetime not in resp.url:
             raise WaybackError(
```