about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2022-07-15 12:55:45 -0700
committer Bryan Newbold <bnewbold@archive.org> 2022-07-15 12:55:48 -0700
commit 7d8d9a8bcc827b136b4dc148f6a2c8c4dd2bbd8b (patch)
tree 551e4a997a2b447e7c4b8e08cbaffd479165dafd
parent f51f7d888d1b30ea874c9656d5cacc84ec7ab8d2 (diff)
download sandcrawler-7d8d9a8bcc827b136b4dc148f6a2c8c4dd2bbd8b.tar.gz
sandcrawler-7d8d9a8bcc827b136b4dc148f6a2c8c4dd2bbd8b.zip
ingest: another form of cookie block URL
This still doesn't short-cut the CDX lookup chain, because that chain consists purely of redirects, which are handled in ia.py.
-rw-r--r-- python/sandcrawler/ingest_file.py 2
1 files changed, 2 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 25ae7d2..1626292 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -176,6 +176,8 @@ class IngestFileWorker(SandcrawlerWorker):
"/cookieAbsent",
"cookieSet=1",
"error=cookies_not_supported",
+ # SPNv2 seems to work (not end up here), but heritrix fails
+ "://secure.jbs.elsevierhealth.com/",
]
self.src_valid_mimetypes = [