about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-10-19 16:43:49 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-10-19 16:43:49 -0700
commit fbbc76c8a7e523c029ff6f881b7ab4220131fd6c (patch)
tree f1fa2d8f3cddcc77dee14cd71257fba5c14ff3a0 /python
parent e5c7645010ed1315a43f9cc0cd20ca192b5e8008 (diff)
download sandcrawler-fbbc76c8a7e523c029ff6f881b7ab4220131fd6c.tar.gz
sandcrawler-fbbc76c8a7e523c029ff6f881b7ab4220131fd6c.zip
ingest: try SPNv2 for no-capture and old failures
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 6
1 files changed, 5 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index f6929f1..b50bcee 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -175,7 +175,11 @@ class IngestFileWorker(SandcrawlerWorker):
#if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'):
# soft404 = True
- if self.try_spn2 and (not resource or (not resource.hit and soft404)):
+ old_failure = False
+ if resource and not resource.hit and resource.terminal_dt < '20190000000000':
+ old_failure = True
+
+ if self.try_spn2 and (resource == None or (resource.status == 'no-capture') or soft404 or old_failure):
via = "spn2"
force_simple_get = 0
for domain in self.spn2_simple_get_domains: