about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-11-14 12:25:45 -0800
committer Bryan Newbold <bnewbold@archive.org> 2019-11-14 12:25:45 -0800
commit 0e755b499b1d0e8f3a8abce9032896936db3b188 (patch)
tree 5fccf7b99cf792160ddad7f761adfbf6b21ff307
parent 76b504e3c8d42e34509d75be4a4df69ae9f15a09 (diff)
download sandcrawler-0e755b499b1d0e8f3a8abce9032896936db3b188.tar.gz
download sandcrawler-0e755b499b1d0e8f3a8abce9032896936db3b188.zip
handle wayback fetch redirect loop in ingest code
-rw-r--r-- python/sandcrawler/ingest.py 7
1 files changed, 5 insertions, 2 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 4938e12..d3f7043 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -74,8 +74,11 @@ class IngestFileWorker(SandcrawlerWorker):
raise SavePageNowError("Failed to find terminal capture from SPNv2")
else:
return self.spn_client.save_url_now_v1(url)
-
- resp = requests.get(WAYBACK_ENDPOINT + cdx['datetime'] + "id_/" + cdx['url'])
+
+ try:
+ resp = requests.get(WAYBACK_ENDPOINT + cdx['datetime'] + "id_/" + cdx['url'])
+ except requests.exceptions.TooManyRedirects as e:
+ raise WaybackError("Redirect loop fetching from wayback (dt: {}, url: {})".format(cdx['datetime'], cdx['url']))
if resp.status_code != cdx['http_status']:
raise WaybackError("Got unexpected wayback status (expected {} from CDX, got {})".format(cdx['http_status'], resp.status_code))
body = resp.content