diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-10 16:08:46 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-10 16:08:46 -0800 |
commit | 1dbc2613d406f3f94bc0ea29621bc81eacc7cea3 (patch) | |
tree | 0a56d7bbf29f74f7cecc3a58f4e92a407d877f9d /python | |
parent | 2866ba252389ac9f3c595e7e7b6c9b4f6cf64663 (diff) | |
download | sandcrawler-1dbc2613d406f3f94bc0ea29621bc81eacc7cea3.tar.gz sandcrawler-1dbc2613d406f3f94bc0ea29621bc81eacc7cea3.zip |
add support for redirect lookups from replay
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 78 |
1 files changed, 69 insertions, 9 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index cbf901a..da4d2b7 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -379,6 +379,47 @@ class WaybackClient: ) return resp.content + def fetch_replay_redirect(self, url, datetime): + """ + Fetches an HTTP 3xx redirect Location from wayback via the replay interface + (web.archive.org) instead of petabox. + + Intended for use with SPN2 requests, where request body has not ended + up in petabox yet. + + Returns None if response is found, but couldn't find redirect. + """ + + # defensively check datetime format + assert len(datetime) == 14 + assert datetime.isdigit() + + try: + resp = requests.get( + self.wayback_endpoint + datetime + "id_/" + url, + allow_redirects=False, + ) + except requests.exceptions.TooManyRedirects: + raise WaybackError("redirect loop (wayback replay fetch)") + try: + resp.raise_for_status() + except Exception as e: + raise WaybackError(str(e)) + #print(resp.url, file=sys.stderr) + + # defensively check that this is actually correct replay based on headers + assert "X-Archive-Src" in resp.headers + if not datetime in resp.url: + raise WaybackError("didn't get exact reply (redirect?) datetime:{} got:{}".format(datetime, resp.url)) + + redirect_url = resp.headers.get("Location") + if redirect_url and redirect_url.startswith("https://web.archive.org/web/"): + redirect_url = "/".join(redirect_url.split("/")[5:]) + if redirect_url and redirect_url.startswith("http"): + return resp.url + else: + return None + def lookup_resource(self, start_url, best_mimetype=None): """ Looks in wayback for a resource starting at the URL, following any @@ -443,15 +484,34 @@ class WaybackClient: cdx=cdx_row, ) elif 300 <= cdx_row.status_code < 400: - resource = self.fetch_petabox( - csize=cdx_row.warc_csize, - offset=cdx_row.warc_offset, - warc_path=cdx_row.warc_path, - ) - assert 300 <= resource.status_code < 400 - assert resource.location - #print(resource, file=sys.stderr) - next_url = resource.location + if '/' in cdx_row.warc_path: + resource = self.fetch_petabox( + csize=cdx_row.warc_csize, + offset=cdx_row.warc_offset, + warc_path=cdx_row.warc_path, + ) + assert 300 <= resource.status_code < 400 + assert resource.location + #print(resource, file=sys.stderr) + next_url = resource.location + else: + next_url = self.fetch_replay_redirect( + url=cdx_row.url, + datetime=cdx_row.datetime, + ) + cdx_row = cdx_partial_from_row(cdx_row) + if not next_url: + print("bad redirect record: {}".format(cdx_row), file=sys.stderr) + return ResourceResult( + start_url=start_url, + hit=False, + status="bad-redirect", + terminal_url=cdx_row.url, + terminal_dt=cdx_row.datetime, + terminal_status_code=cdx_row.status_code, + body=None, + cdx=cdx_row, + ) if next_url in urls_seen: return ResourceResult( start_url=start_url, |