aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-10-19 16:27:01 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-10-19 16:27:03 -0700
commit b388be5aff1b074b82a5382c5267a8ab4c9e615b (patch)
tree 6465c1318c5acd74586de1b5a5a6fd8334aa11f8
parent 03be839b252f8c40d18abbfd88374c6c70ffb584 (diff)
download sandcrawler-b388be5aff1b074b82a5382c5267a8ab4c9e615b.tar.gz
sandcrawler-b388be5aff1b074b82a5382c5267a8ab4c9e615b.zip
ingest: disable soft404 and non-hit SPNv2 retries
This retry behavior might have made sense at some point, but I had forgotten about this code path and it makes no sense now. It has been resulting in a large number of extraneous SPN requests.
-rw-r--r-- python/sandcrawler/ingest.py 9
1 files changed, 5 insertions, 4 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 5ab7e13..f6929f1 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -169,12 +169,13 @@ class IngestFileWorker(SandcrawlerWorker):
resource = self.wayback_client.lookup_resource(url, best_mimetype)
# check for "soft 404" conditions, where we should retry with live SPNv2
- # TODO: could refactor these into the resource fetch things themselves?
soft404 = False
- if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'):
- soft404 = True
+ # NOTE: these are often not working with SPNv2 either, so disabling. If
+ # we really want to try again, should do force-recrawl
+ #if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'):
+ # soft404 = True
- if self.try_spn2 and (not resource or not resource.hit or soft404):
+ if self.try_spn2 and (not resource or (not resource.hit and soft404)):
via = "spn2"
force_simple_get = 0
for domain in self.spn2_simple_get_domains: