diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-02-18 19:13:32 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-02-18 19:13:34 -0800 |
commit | d03eb65e77f7671e996d6d84c88496581357c744 (patch) | |
tree | fe08426c005037465aa41a4dbbb8a375f99dac6a /python | |
parent | 4c75d606b385feb29c37d48e0fcf077abf82f92f (diff) | |
download | sandcrawler-d03eb65e77f7671e996d6d84c88496581357c744.tar.gz sandcrawler-d03eb65e77f7671e996d6d84c88496581357c744.zip |
wayback: on bad redirects, log instead of assert
This is a different form of mangled redirect.
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ia.py | 15 |
1 file changed, 13 insertions, 2 deletions
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 1d997f4..9dad7a0 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -577,8 +577,19 @@ class WaybackClient:
                         resolve_revisit=False,
                     )
                     assert 300 <= resource.status_code < 400
-                    assert resource.location
-                    #print(resource, file=sys.stderr)
+                    if not resource.location:
+                        print("bad redirect record: {}".format(cdx_row), file=sys.stderr)
+                        return ResourceResult(
+                            start_url=start_url,
+                            hit=False,
+                            status="bad-redirect",
+                            terminal_url=cdx_row.url,
+                            terminal_dt=cdx_row.datetime,
+                            terminal_status_code=cdx_row.status_code,
+                            body=None,
+                            cdx=cdx_row,
+                            revisit_cdx=None,
+                        )
                     next_url = resource.location
                 else:
                     next_url = self.fetch_replay_redirect(
```