From 0e755b499b1d0e8f3a8abce9032896936db3b188 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 14 Nov 2019 12:25:45 -0800 Subject: handle wayback fetch redirect loop in ingest code --- python/sandcrawler/ingest.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 4938e12..d3f7043 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -74,8 +74,11 @@ class IngestFileWorker(SandcrawlerWorker): raise SavePageNowError("Failed to find terminal capture from SPNv2") else: return self.spn_client.save_url_now_v1(url) - - resp = requests.get(WAYBACK_ENDPOINT + cdx['datetime'] + "id_/" + cdx['url']) + + try: + resp = requests.get(WAYBACK_ENDPOINT + cdx['datetime'] + "id_/" + cdx['url']) + except requests.exceptions.TooManyRedirects as e: + raise WaybackError("Redirect loop fetching from wayback (dt: {}, url: {})".format(cdx['datetime'], cdx['url'])) if resp.status_code != cdx['http_status']: raise WaybackError("Got unexpected wayback status (expected {} from CDX, got {})".format(cdx['http_status'], resp.status_code)) body = resp.content -- cgit v1.2.3