From fbbc76c8a7e523c029ff6f881b7ab4220131fd6c Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 19 Oct 2020 16:43:49 -0700
Subject: ingest: try SPNv2 for no-capture and old failures

---
 python/sandcrawler/ingest.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index f6929f1..b50bcee 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -175,7 +175,11 @@ class IngestFileWorker(SandcrawlerWorker):
         #if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'):
         #    soft404 = True
 
-        if self.try_spn2 and (not resource or (not resource.hit and soft404)):
+        old_failure = False
+        if resource and not resource.hit and resource.terminal_dt < '20190000000000':
+            old_failure = True
+
+        if self.try_spn2 and (resource == None or (resource.status == 'no-capture') or soft404 or old_failure):
             via = "spn2"
             force_simple_get = 0
             for domain in self.spn2_simple_get_domains:
-- 
cgit v1.2.3