diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-05-21 17:41:41 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-05-21 17:41:41 -0700 |
commit | 1263ee33535d232d702324980e7ff69305ed8795 (patch) | |
tree | f4ec34e52aec28c42ba432fab2945419a3658d3f /python/sandcrawler/ingest.py | |
parent | 071af9a4832dcb24be417de9b658d678056b5bf2 (diff) | |
download | sandcrawler-1263ee33535d232d702324980e7ff69305ed8795.tar.gz sandcrawler-1263ee33535d232d702324980e7ff69305ed8795.zip |
ingest PDF extraction updates
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- | python/sandcrawler/ingest.py | 5 |
1 files changed, 3 insertions, 2 deletions
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index eb8e256..b610ab4 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -128,8 +128,9 @@ class IngestFileWorker(SandcrawlerWorker):
         self.wall_blocklist = [
             # loginwall
             "://profile.thieme.de/HTML/sso/ejournals/login.htm",
-            "://login.bepress.com/"
-            "?SAMLRequest="
+            "://login.bepress.com/",
+            "?SAMLRequest=",
+            "://osapublishing.org/captcha/",
         ]
 
         # these are special-case web domains for which we want SPN2 to not run
```