From 7d8d9a8bcc827b136b4dc148f6a2c8c4dd2bbd8b Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 15 Jul 2022 12:55:45 -0700
Subject: ingest: another form of cookie block URL

This still doesn't short-cut CDX lookup chain, because that is all pure
redirects happening in ia.py.
---
 python/sandcrawler/ingest_file.py | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'python')

diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 25ae7d2..1626292 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -176,6 +176,8 @@ class IngestFileWorker(SandcrawlerWorker):
             "/cookieAbsent",
             "cookieSet=1",
             "error=cookies_not_supported",
+            # SPNv2 seems to work (not end up here), but heritrix fails
+            "://secure.jbs.elsevierhealth.com/",
         ]
 
         self.src_valid_mimetypes = [
-- 
cgit v1.2.3