about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/ia.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-03-24 13:19:02 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-03-24 13:19:04 -0700
commit: 11f7c22e1de104918d32274feaea310c11476cc7 (patch)
tree: 401eee94894fa4344859bab734e39f34994b6909 /python/sandcrawler/ia.py
parent: 84eeefbd3c55ea31bcf552f9c129c0e1576717ae (diff)
download: sandcrawler-11f7c22e1de104918d32274feaea310c11476cc7.tar.gz
sandcrawler-11f7c22e1de104918d32274feaea310c11476cc7.zip
ia: more conservative use of clean_url()
Fixes AttributeError: 'NoneType' object has no attribute 'strip'. Seen in production on the lookup_resource code path.
Diffstat (limited to 'python/sandcrawler/ia.py')
-rw-r--r-- python/sandcrawler/ia.py 8
1 file changed, 5 insertions, 3 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index cc176d0..510f23e 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -543,8 +543,8 @@ class WaybackClient:
if redirect_url and redirect_url.startswith("https://web.archive.org/web/"):
redirect_url = "/".join(redirect_url.split("/")[5:])
#print(redirect_url, file=sys.stderr)
- redirect_url = clean_url(redirect_url)
if redirect_url and redirect_url.startswith("http"):
+ redirect_url = clean_url(redirect_url)
return redirect_url
else:
return None
@@ -667,13 +667,15 @@ class WaybackClient:
next_url = domain_prefix + resource.location
else:
next_url = resource.location
- next_url = clean_url(next_url)
+ if next_url:
+ next_url = clean_url(next_url)
else:
next_url = self.fetch_replay_redirect(
url=cdx_row.url,
datetime=cdx_row.datetime,
)
- next_url = clean_url(next_url)
+ if next_url:
+ next_url = clean_url(next_url)
cdx_row = cdx_partial_from_row(cdx_row)
if not next_url:
print("bad redirect record: {}".format(cdx_row), file=sys.stderr)