diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-11-13 20:31:03 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-11-13 20:31:03 -0800 |
commit | 889928e49c90776b0203d6612b6d229f9bb7725e (patch) | |
tree | 7857533f4c94d8eca1eb669b3ea9b7ab7e441428 | |
parent | ab0ee1bb7aae902f0e0b3812e8f328f44189fdc5 (diff) | |
download | sandcrawler-889928e49c90776b0203d6612b6d229f9bb7725e.tar.gz sandcrawler-889928e49c90776b0203d6612b6d229f9bb7725e.zip |
handle requests (http) redirect loop from wayback
-rw-r--r-- | python/sandcrawler/ia.py | 5 |
1 file changed, 4 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/ia.py b/python/sandcrawler/ia.py
index 52ff4d5..39227ca 100644
--- a/python/sandcrawler/ia.py
+++ b/python/sandcrawler/ia.py
@@ -68,7 +68,10 @@ class CdxApiClient:
             sha1hex=b32_hex(cdx[5]),
         )
         if follow_redirects and cdx['http_status'] in (301, 302):
-            resp = requests.get(self.wayback_endpoint + cdx['datetime'] + "id_/" + cdx['url'])
+            try:
+                resp = requests.get(self.wayback_endpoint + cdx['datetime'] + "id_/" + cdx['url'])
+            except requests.exceptions.TooManyRedirects:
+                raise CdxApiError("redirect loop (wayback fetch)")
             next_url = '/'.join(resp.url.split('/')[5:])
             if next_url == url:
                 raise CdxApiError("redirect loop (by url)")
```