about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2022-07-15 14:31:52 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-07-15 14:31:52 -0700
commit: fe18ff4478de8481b732dce1408a39b1d3c2795d (patch)
tree: 9396025636ede4e76dc0aa509a120ae621d08f31
parent: 0c0f9714724e65c0b12ac9c76132c6ab1590e823 (diff)
download: sandcrawler-fe18ff4478de8481b732dce1408a39b1d3c2795d.tar.gz
sandcrawler-fe18ff4478de8481b732dce1408a39b1d3c2795d.zip
wayback: use same 5xx/4xx-allowing tricks for replay body fetch as for replay redirect
-rw-r--r-- python/sandcrawler/ia.py 14
1 files changed, 7 insertions, 7 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 227f7d0..e08031e 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -617,7 +617,7 @@ class WaybackClient:
assert datetime.isdigit()
try:
- resp = self.http_session.get(
+ resp = self.record_http_session.get(
self.wayback_endpoint + datetime + "id_/" + url,
allow_redirects=False,
headers=self.replay_headers,
@@ -635,14 +635,14 @@ class WaybackClient:
)
)
- try:
- resp.raise_for_status()
- except Exception as e:
- raise WaybackError(str(e))
- # print(resp.url, file=sys.stderr)
-
# defensively check that this is actually correct replay based on headers
if "X-Archive-Src" not in resp.headers:
+ # check if this was an error first
+ try:
+ resp.raise_for_status()
+ except Exception as e:
+ raise WaybackError(str(e))
+ # otherwise, a weird case (200/redirect but no Src header)
raise WaybackError("replay fetch didn't return X-Archive-Src in headers")
if datetime not in resp.url:
raise WaybackError(