aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-08-11 19:49:08 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-08-11 19:49:08 -0700
commit 8782f83631365bffdd6a54fa436f5c477386fd1f (patch)
tree 6e163fd5dc1da579758d2316da63c9e6c4c4fa8f /python
parent bb8d4a3591826af3cb15f32404b98eb7995d005b (diff)
download sandcrawler-8782f83631365bffdd6a54fa436f5c477386fd1f.tar.gz
sandcrawler-8782f83631365bffdd6a54fa436f5c477386fd1f.zip
additional loginwall patterns
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 2
1 files changed, 2 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index d910665..c45437d 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -105,6 +105,8 @@ class IngestFileWorker(SandcrawlerWorker):
self.wall_blocklist = [
# loginwall
"://profile.thieme.de/HTML/sso/ejournals/login.htm",
+ "://login.bepress.com/",
+ "?SAMLRequest=",
]
# these are special-case web domains for which we want SPN2 to not run