aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-18 19:13:32 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-18 19:13:34 -0800
commit d03eb65e77f7671e996d6d84c88496581357c744 (patch)
tree fe08426c005037465aa41a4dbbb8a375f99dac6a /python
parent 4c75d606b385feb29c37d48e0fcf077abf82f92f (diff)
download sandcrawler-d03eb65e77f7671e996d6d84c88496581357c744.tar.gz
sandcrawler-d03eb65e77f7671e996d6d84c88496581357c744.zip
wayback: on bad redirects, log instead of assert
This is a different form of mangled redirect.
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ia.py 15
1 files changed, 13 insertions, 2 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 1d997f4..9dad7a0 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -577,8 +577,19 @@ class WaybackClient:
resolve_revisit=False,
)
assert 300 <= resource.status_code < 400
- assert resource.location
- #print(resource, file=sys.stderr)
+ if not resource.location:
+ print("bad redirect record: {}".format(cdx_row), file=sys.stderr)
+ return ResourceResult(
+ start_url=start_url,
+ hit=False,
+ status="bad-redirect",
+ terminal_url=cdx_row.url,
+ terminal_dt=cdx_row.datetime,
+ terminal_status_code=cdx_row.status_code,
+ body=None,
+ cdx=cdx_row,
+ revisit_cdx=None,
+ )
next_url = resource.location
else:
next_url = self.fetch_replay_redirect(