about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-11-13 20:31:03 -0800
committer Bryan Newbold <bnewbold@archive.org> 2019-11-13 20:31:03 -0800
commit 889928e49c90776b0203d6612b6d229f9bb7725e (patch)
tree 7857533f4c94d8eca1eb669b3ea9b7ab7e441428
parent ab0ee1bb7aae902f0e0b3812e8f328f44189fdc5 (diff)
download sandcrawler-889928e49c90776b0203d6612b6d229f9bb7725e.tar.gz
sandcrawler-889928e49c90776b0203d6612b6d229f9bb7725e.zip
handle requests (http) redirect loop from wayback
-rw-r--r-- python/sandcrawler/ia.py 5
1 files changed, 4 insertions, 1 deletions
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 52ff4d5..39227ca 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -68,7 +68,10 @@ class CdxApiClient:
sha1hex=b32_hex(cdx[5]),
)
if follow_redirects and cdx['http_status'] in (301, 302):
- resp = requests.get(self.wayback_endpoint + cdx['datetime'] + "id_/" + cdx['url'])
+ try:
+ resp = requests.get(self.wayback_endpoint + cdx['datetime'] + "id_/" + cdx['url'])
+ except requests.exceptions.TooManyRedirects:
+ raise CdxApiError("redirect loop (wayback fetch)")
next_url = '/'.join(resp.url.split('/')[5:])
if next_url == url:
raise CdxApiError("redirect loop (by url)")