diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-10-19 16:43:49 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-10-19 16:43:49 -0700 |
commit | fbbc76c8a7e523c029ff6f881b7ab4220131fd6c (patch) | |
tree | f1fa2d8f3cddcc77dee14cd71257fba5c14ff3a0 | |
parent | e5c7645010ed1315a43f9cc0cd20ca192b5e8008 (diff) | |
download | sandcrawler-fbbc76c8a7e523c029ff6f881b7ab4220131fd6c.tar.gz sandcrawler-fbbc76c8a7e523c029ff6f881b7ab4220131fd6c.zip |
ingest: try SPNv2 for no-capture and old failures
-rw-r--r-- | python/sandcrawler/ingest.py | 6 |
1 file changed, 5 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index f6929f1..b50bcee 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -175,7 +175,11 @@ class IngestFileWorker(SandcrawlerWorker):
         #if resource and resource.hit and resource.terminal_url.endswith('/cookieAbsent'):
         #    soft404 = True
 
-        if self.try_spn2 and (not resource or (not resource.hit and soft404)):
+        old_failure = False
+        if resource and not resource.hit and resource.terminal_dt < '20190000000000':
+            old_failure = True
+
+        if self.try_spn2 and (resource == None or (resource.status == 'no-capture') or soft404 or old_failure):
             via = "spn2"
             force_simple_get = 0
             for domain in self.spn2_simple_get_domains:
```