aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-26 15:20:39 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-26 15:24:36 -0700
commit b6a7989c8d2547ad0ea406dbfd4b8a29cc14151d (patch)
tree 11455e570b6c9377f75cec89c32675b5684f8361 /python
parent 730103121e72ab515979a00341c8a44e362edc71 (diff)
download sandcrawler-b6a7989c8d2547ad0ea406dbfd4b8a29cc14151d.tar.gz
sandcrawler-b6a7989c8d2547ad0ea406dbfd4b8a29cc14151d.zip
ia: more tweaks to delicate code to satisfy type checker
Ran the 'live' wayback tests after this commit as a check, and they passed (once the FTP status code behavior change is fixed).
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ia.py 22
1 files changed, 12 insertions, 10 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 1148de2..04a1e3b 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -683,11 +683,10 @@ class WaybackClient:
urls_seen = [start_url]
for i in range(self.max_redirects + 1):
print(" URL: {}".format(next_url), file=sys.stderr)
- cdx_row = self.cdx_client.lookup_best(next_url,
- best_mimetype=best_mimetype,
- closest=closest)
- #print(cdx_row, file=sys.stderr)
- if not cdx_row:
+ next_row: Optional[CdxRow] = self.cdx_client.lookup_best(
+ next_url, best_mimetype=best_mimetype, closest=closest)
+ #print(next_row, file=sys.stderr)
+ if not next_row:
return ResourceResult(
start_url=start_url,
hit=False,
@@ -700,6 +699,8 @@ class WaybackClient:
revisit_cdx=None,
)
+ cdx_row: CdxRow = next_row
+
# first try straight-forward redirect situation
if cdx_row.mimetype == "warc/revisit" and '/' in cdx_row.warc_path:
resource = self.fetch_petabox(
@@ -778,14 +779,15 @@ class WaybackClient:
if next_url:
next_url = clean_url(next_url)
else:
- next_url = self.fetch_replay_redirect(
+ redirect_url = self.fetch_replay_redirect(
url=cdx_row.url,
datetime=cdx_row.datetime,
)
- if next_url:
- next_url = clean_url(next_url)
- cdx_row = cdx_partial_from_row(cdx_row)
- if not next_url:
+ if redirect_url:
+ redirect_url = clean_url(redirect_url)
+ if redirect_url:
+ next_url = redirect_url
+ else:
print(" bad redirect record: {}".format(cdx_row), file=sys.stderr)
return ResourceResult(
start_url=start_url,