diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-15 12:55:45 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-15 12:55:48 -0700 |
commit | 7d8d9a8bcc827b136b4dc148f6a2c8c4dd2bbd8b (patch) | |
tree | 551e4a997a2b447e7c4b8e08cbaffd479165dafd | |
parent | f51f7d888d1b30ea874c9656d5cacc84ec7ab8d2 (diff) | |
download | sandcrawler-7d8d9a8bcc827b136b4dc148f6a2c8c4dd2bbd8b.tar.gz sandcrawler-7d8d9a8bcc827b136b4dc148f6a2c8c4dd2bbd8b.zip |
ingest: another form of cookie block URL
This still doesn't short-cut the CDX lookup chain, because that chain consists
entirely of pure redirects, which are handled in ia.py.
-rw-r--r-- | python/sandcrawler/ingest_file.py | 2 |
1 file changed, 2 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 25ae7d2..1626292 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -176,6 +176,8 @@ class IngestFileWorker(SandcrawlerWorker):
             "/cookieAbsent",
             "cookieSet=1",
             "error=cookies_not_supported",
+            # SPNv2 seems to work (not end up here), but heritrix fails
+            "://secure.jbs.elsevierhealth.com/",
         ]
 
         self.src_valid_mimetypes = [