From d03eb65e77f7671e996d6d84c88496581357c744 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 18 Feb 2020 19:13:32 -0800
Subject: wayback: on bad redirects, log instead of assert

This is a different form of mangled redirect.
---
 python/sandcrawler/ia.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'python')

diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 1d997f4..9dad7a0 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -577,8 +577,19 @@ class WaybackClient:
                         resolve_revisit=False,
                     )
                     assert 300 <= resource.status_code < 400
-                    assert resource.location
-                    #print(resource, file=sys.stderr)
+                    if not resource.location:
+                        print("bad redirect record: {}".format(cdx_row), file=sys.stderr)
+                        return ResourceResult(
+                            start_url=start_url,
+                            hit=False,
+                            status="bad-redirect",
+                            terminal_url=cdx_row.url,
+                            terminal_dt=cdx_row.datetime,
+                            terminal_status_code=cdx_row.status_code,
+                            body=None,
+                            cdx=cdx_row,
+                            revisit_cdx=None,
+                        )
                     next_url = resource.location
                 else:
                     next_url = self.fetch_replay_redirect(
--
cgit v1.2.3