add support for redirect lookups from replay

author: Bryan Newbold <bnewbold@archive.org> 2020-01-10 16:08:46 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-10 16:08:46 -0800
commit: 1dbc2613d406f3f94bc0ea29621bc81eacc7cea3 (patch)
tree: 0a56d7bbf29f74f7cecc3a58f4e92a407d877f9d /python
parent: 2866ba252389ac9f3c595e7e7b6c9b4f6cf64663 (diff)
download: sandcrawler-1dbc2613d406f3f94bc0ea29621bc81eacc7cea3.tar.gz
sandcrawler-1dbc2613d406f3f94bc0ea29621bc81eacc7cea3.zip
1 files changed, 69 insertions, 9 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index cbf901a..da4d2b7 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -379,6 +379,47 @@ class WaybackClient:
                 )
         return resp.content
 
+    def fetch_replay_redirect(self, url, datetime):
+        """
+        Fetches an HTTP 3xx redirect Location from wayback via the replay interface
+        (web.archive.org) instead of petabox.
+
+        Intended for use with SPN2 requests, where request body has not ended
+        up in petabox yet.
+
+        Returns the redirect target URL (any web.archive.org replay prefix
+        stripped), or None if a response was found but has no usable Location.
+        Raises WaybackError on HTTP errors or if the reply isn't for `datetime`.
+        """
+        # defensively check datetime format
+        assert len(datetime) == 14
+        assert datetime.isdigit()
+
+        try:
+            resp = requests.get(
+                self.wayback_endpoint + datetime + "id_/" + url,
+                allow_redirects=False,
+            )
+        except requests.exceptions.TooManyRedirects:
+            raise WaybackError("redirect loop (wayback replay fetch)")
+        try:
+            resp.raise_for_status()
+        except Exception as e:
+            raise WaybackError(str(e))
+
+        # defensively check that this is actually correct replay based on headers
+        assert "X-Archive-Src" in resp.headers
+        if datetime not in resp.url:
+            raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url))
+
+        redirect_url = resp.headers.get("Location")
+        # drop "https://web.archive.org/web/<segment>/" (presumably the replay timestamp)
+        if redirect_url and redirect_url.startswith("https://web.archive.org/web/"):
+            redirect_url = "/".join(redirect_url.split("/")[5:])
+        if redirect_url and redirect_url.startswith("http"):
+            return redirect_url  # the redirect target, not resp.url (the URL we just requested)
+        return None
+
     def lookup_resource(self, start_url, best_mimetype=None):
         """
         Looks in wayback for a resource starting at the URL, following any
@@ -443,15 +484,34 @@ class WaybackClient:
                     cdx=cdx_row,
                 )
             elif 300 <= cdx_row.status_code < 400:
-                resource = self.fetch_petabox(
-                    csize=cdx_row.warc_csize,
-                    offset=cdx_row.warc_offset,
-                    warc_path=cdx_row.warc_path,
-                )
-                assert 300 <= resource.status_code < 400
-                assert resource.location
-                #print(resource, file=sys.stderr)
-                next_url = resource.location
+                if '/' in cdx_row.warc_path:
+                    resource = self.fetch_petabox(
+                        csize=cdx_row.warc_csize,
+                        offset=cdx_row.warc_offset,
+                        warc_path=cdx_row.warc_path,
+                    )
+                    assert 300 <= resource.status_code < 400
+                    assert resource.location
+                    #print(resource, file=sys.stderr)
+                    next_url = resource.location
+                else:
+                    next_url = self.fetch_replay_redirect(
+                        url=cdx_row.url,
+                        datetime=cdx_row.datetime,
+                    )
+                    cdx_row = cdx_partial_from_row(cdx_row)
+                    if not next_url:
+                        print("bad redirect record: {}".format(cdx_row), file=sys.stderr)
+                        return ResourceResult(
+                            start_url=start_url,
+                            hit=False,
+                            status="bad-redirect",
+                            terminal_url=cdx_row.url,
+                            terminal_dt=cdx_row.datetime,
+                            terminal_status_code=cdx_row.status_code,
+                            body=None,
+                            cdx=cdx_row,
+                        )
                 if next_url in urls_seen:
                     return ResourceResult(
                         start_url=start_url,
author	Bryan Newbold <bnewbold@archive.org>	2020-01-10 16:08:46 -0800
committer	Bryan Newbold <bnewbold@archive.org>	2020-01-10 16:08:46 -0800
commit	1dbc2613d406f3f94bc0ea29621bc81eacc7cea3 (patch)
tree	0a56d7bbf29f74f7cecc3a58f4e92a407d877f9d /python
parent	2866ba252389ac9f3c595e7e7b6c9b4f6cf64663 (diff)
download	sandcrawler-1dbc2613d406f3f94bc0ea29621bc81eacc7cea3.tar.gz sandcrawler-1dbc2613d406f3f94bc0ea29621bc81eacc7cea3.zip