wayback fetch: handle upstream 5xx replays

author: Bryan Newbold <bnewbold@archive.org> 2022-07-13 11:41:46 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-07-13 11:41:46 -0700
commit: 0202f325acd6436bf4753e62fd8803f29f4bd027 (patch)
tree: 1d2efed714166174a8feec995ce58cec154aae1f /python/sandcrawler
parent: 28018ae483b0158142f4ffaf14bea7f6858d11ce (diff)
download: sandcrawler-0202f325acd6436bf4753e62fd8803f29f4bd027.tar.gz
sandcrawler-0202f325acd6436bf4753e62fd8803f29f4bd027.zip
1 files changed, 15 insertions, 4 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 6003f02..8462da1 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -396,6 +396,9 @@ class WaybackClient:
             "User-Agent": "Mozilla/5.0 sandcrawler.WaybackClient",
         }
         self.http_session = requests_retry_session()
+        self.record_http_session = requests_retry_session(
+            status_forcelist=[],
+        )
 
     def fetch_petabox(
         self, csize: int, offset: int, warc_path: str, resolve_revisit: bool = True
@@ -673,11 +676,18 @@ class WaybackClient:
         assert datetime.isdigit()
 
         try:
-            resp = self.http_session.get(
+            # when fetching via `id_`, it is possible to get a 5xx error which
+            # is either a wayback error, or an actual replay of an upstream 5xx
+            # error. raise_for_status() is called inside this try block, and a
+            # separate HTTP session (empty status_forcelist) is used, so the
+            # handler below can inspect the response and tell the cases apart
+            resp = None
+            resp = self.record_http_session.get(
                 self.wayback_endpoint + datetime + "id_/" + url,
                 allow_redirects=False,
                 headers=self.replay_headers,
             )
+            resp.raise_for_status()
         except requests.exceptions.TooManyRedirects:
             raise WaybackContentError("redirect loop (wayback replay fetch)")
         except UnicodeDecodeError:
@@ -686,11 +696,12 @@ class WaybackClient:
                     url
                 )
             )
-        try:
-            resp.raise_for_status()
         except Exception as e:
+            if resp is not None and "X-Archive-Src" in resp.headers:
+                raise WaybackContentError(
+                    f"expected redirect record but got HTTP status {resp.status_code}"
+                )
             raise WaybackError(str(e))
-        # print(resp.url, file=sys.stderr)
 
         # defensively check that this is actually correct replay based on headers
         # previously check for "X-Archive-Redirect-Reason" here
author	Bryan Newbold <bnewbold@archive.org>	2022-07-13 11:41:46 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2022-07-13 11:41:46 -0700
commit	0202f325acd6436bf4753e62fd8803f29f4bd027 (patch)
tree	1d2efed714166174a8feec995ce58cec154aae1f /python/sandcrawler
parent	28018ae483b0158142f4ffaf14bea7f6858d11ce (diff)
download	sandcrawler-0202f325acd6436bf4753e62fd8803f29f4bd027.tar.gz sandcrawler-0202f325acd6436bf4753e62fd8803f29f4bd027.zip