diff options
-rw-r--r-- | python/sandcrawler/ia.py | 22 |
1 files changed, 12 insertions, 10 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 1148de2..04a1e3b 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -683,11 +683,10 @@ class WaybackClient: urls_seen = [start_url] for i in range(self.max_redirects + 1): print(" URL: {}".format(next_url), file=sys.stderr) - cdx_row = self.cdx_client.lookup_best(next_url, - best_mimetype=best_mimetype, - closest=closest) - #print(cdx_row, file=sys.stderr) - if not cdx_row: + next_row: Optional[CdxRow] = self.cdx_client.lookup_best( + next_url, best_mimetype=best_mimetype, closest=closest) + #print(next_row, file=sys.stderr) + if not next_row: return ResourceResult( start_url=start_url, hit=False, @@ -700,6 +699,8 @@ class WaybackClient: revisit_cdx=None, ) + cdx_row: CdxRow = next_row + # first try straight-forward redirect situation if cdx_row.mimetype == "warc/revisit" and '/' in cdx_row.warc_path: resource = self.fetch_petabox( @@ -778,14 +779,15 @@ class WaybackClient: if next_url: next_url = clean_url(next_url) else: - next_url = self.fetch_replay_redirect( + redirect_url = self.fetch_replay_redirect( url=cdx_row.url, datetime=cdx_row.datetime, ) - if next_url: - next_url = clean_url(next_url) - cdx_row = cdx_partial_from_row(cdx_row) - if not next_url: + if redirect_url: + redirect_url = clean_url(redirect_url) + if redirect_url: + next_url = redirect_url + else: print(" bad redirect record: {}".format(cdx_row), file=sys.stderr) return ResourceResult( start_url=start_url, |