about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/ingest.py
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-05-21 17:41:41 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-05-21 17:41:41 -0700
commit: 1263ee33535d232d702324980e7ff69305ed8795 (patch)
tree: f4ec34e52aec28c42ba432fab2945419a3658d3f /python/sandcrawler/ingest.py
parent: 071af9a4832dcb24be417de9b658d678056b5bf2 (diff)
download: sandcrawler-1263ee33535d232d702324980e7ff69305ed8795.tar.gz
download: sandcrawler-1263ee33535d232d702324980e7ff69305ed8795.zip
ingest PDF extraction updates
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- python/sandcrawler/ingest.py 5
1 files changed, 3 insertions, 2 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index eb8e256..b610ab4 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -128,8 +128,9 @@ class IngestFileWorker(SandcrawlerWorker):
self.wall_blocklist = [
# loginwall
"://profile.thieme.de/HTML/sso/ejournals/login.htm",
- "://login.bepress.com/"
- "?SAMLRequest="
+ "://login.bepress.com/",
+ "?SAMLRequest=",
+ "://osapublishing.org/captcha/",
]
# these are special-case web domains for which we want SPN2 to not run