diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-03-24 13:19:02 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-03-24 13:19:04 -0700 |
commit | 11f7c22e1de104918d32274feaea310c11476cc7 (patch) | |
tree | 401eee94894fa4344859bab734e39f34994b6909 | |
parent | 84eeefbd3c55ea31bcf552f9c129c0e1576717ae (diff) | |
download | sandcrawler-11f7c22e1de104918d32274feaea310c11476cc7.tar.gz sandcrawler-11f7c22e1de104918d32274feaea310c11476cc7.zip |
ia: more conservative use of clean_url()
Fixes AttributeError: 'NoneType' object has no attribute 'strip'
Seen in production on the lookup_resource code path.
-rw-r--r-- | python/sandcrawler/ia.py | 8 |
1 file changed, 5 insertions, 3 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index cc176d0..510f23e 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -543,8 +543,8 @@ class WaybackClient:
         if redirect_url and redirect_url.startswith("https://web.archive.org/web/"):
             redirect_url = "/".join(redirect_url.split("/")[5:])
         #print(redirect_url, file=sys.stderr)
-        redirect_url = clean_url(redirect_url)
         if redirect_url and redirect_url.startswith("http"):
+            redirect_url = clean_url(redirect_url)
             return redirect_url
         else:
             return None
@@ -667,13 +667,15 @@ class WaybackClient:
                         next_url = domain_prefix + resource.location
                     else:
                         next_url = resource.location
-                    next_url = clean_url(next_url)
+                    if next_url:
+                        next_url = clean_url(next_url)
                 else:
                     next_url = self.fetch_replay_redirect(
                         url=cdx_row.url,
                         datetime=cdx_row.datetime,
                     )
-                    next_url = clean_url(next_url)
+                    if next_url:
+                        next_url = clean_url(next_url)
                 cdx_row = cdx_partial_from_row(cdx_row)
                 if not next_url:
                     print("bad redirect record: {}".format(cdx_row), file=sys.stderr)
```