diff options
-rw-r--r-- | python/sandcrawler/ia.py | 15 |
1 files changed, 13 insertions, 2 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py index 1d997f4..9dad7a0 100644 --- a/python/sandcrawler/ia.py +++ b/python/sandcrawler/ia.py @@ -577,8 +577,19 @@ class WaybackClient: resolve_revisit=False, ) assert 300 <= resource.status_code < 400 - assert resource.location - #print(resource, file=sys.stderr) + if not resource.location: + print("bad redirect record: {}".format(cdx_row), file=sys.stderr) + return ResourceResult( + start_url=start_url, + hit=False, + status="bad-redirect", + terminal_url=cdx_row.url, + terminal_dt=cdx_row.datetime, + terminal_status_code=cdx_row.status_code, + body=None, + cdx=cdx_row, + revisit_cdx=None, + ) next_url = resource.location else: next_url = self.fetch_replay_redirect( |