From b388be5aff1b074b82a5382c5267a8ab4c9e615b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 19 Oct 2020 16:27:01 -0700 Subject: ingest: disable soft404 and non-hit SPNv2 retries This might have made sense at some point, but I had forgotten about this code path and it makes no sense now. It has been resulting in a very large number of extraneous SPN requests. --- python/sandcrawler/ingest.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 5ab7e13..f6929f1 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -169,12 +169,13 @@ class IngestFileWorker(SandcrawlerWorker): resource = self.wayback_client.lookup_resource(url, best_mimetype) # check for "soft 404" conditions, where we should retry with live SPNv2 - # TODO: could refactor these into the resource fetch things themselves? soft404 = False - if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'): - soft404 = True + # NOTE: these are often not working with SPNv2 either, so disabling. If + # we really want to try again, should do force-recrawl + #if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'): + # soft404 = True - if self.try_spn2 and (not resource or not resource.hit or soft404): + if self.try_spn2 and (not resource or (not resource.hit and soft404)): via = "spn2" force_simple_get = 0 for domain in self.spn2_simple_get_domains: -- cgit v1.2.3