diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 15:20:39 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 15:24:36 -0700 |
commit | b6a7989c8d2547ad0ea406dbfd4b8a29cc14151d (patch) | |
tree | 11455e570b6c9377f75cec89c32675b5684f8361 | |
parent | 730103121e72ab515979a00341c8a44e362edc71 (diff) | |
download | sandcrawler-b6a7989c8d2547ad0ea406dbfd4b8a29cc14151d.tar.gz sandcrawler-b6a7989c8d2547ad0ea406dbfd4b8a29cc14151d.zip |
ia: more tweaks to delicate code to satisfy type checker
Ran the 'live' wayback tests after this commit as a check, and they worked
(once the FTP status code behavior change is fixed)
-rw-r--r-- | python/sandcrawler/ia.py | 22 |
1 file changed, 12 insertions, 10 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 1148de2..04a1e3b 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -683,11 +683,10 @@ class WaybackClient: urls_seen = [start_url] for i in range(self.max_redirects + 1): print(" URL: {}".format(next_url), file=sys.stderr) - cdx_row = self.cdx_client.lookup_best(next_url, - best_mimetype=best_mimetype, - closest=closest) - #print(cdx_row, file=sys.stderr) - if not cdx_row: + next_row: Optional[CdxRow] = self.cdx_client.lookup_best( + next_url, best_mimetype=best_mimetype, closest=closest) + #print(next_row, file=sys.stderr) + if not next_row: return ResourceResult( start_url=start_url, hit=False, @@ -700,6 +699,8 @@ class WaybackClient: revisit_cdx=None, ) + cdx_row: CdxRow = next_row + # first try straight-forward redirect situation if cdx_row.mimetype == "warc/revisit" and '/' in cdx_row.warc_path: resource = self.fetch_petabox( @@ -778,14 +779,15 @@ class WaybackClient: if next_url: next_url = clean_url(next_url) else: - next_url = self.fetch_replay_redirect( + redirect_url = self.fetch_replay_redirect( url=cdx_row.url, datetime=cdx_row.datetime, ) - if next_url: - next_url = clean_url(next_url) - cdx_row = cdx_partial_from_row(cdx_row) - if not next_url: + if redirect_url: + redirect_url = clean_url(redirect_url) + if redirect_url: + next_url = redirect_url + else: print(" bad redirect record: {}".format(cdx_row), file=sys.stderr) return ResourceResult( start_url=start_url, |