about summary refs log tree commit diff stats
path: root/python/sandcrawler/ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- python/sandcrawler/ingest.py 11
1 files changed, 11 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 1f693dc..918a832 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -102,6 +102,11 @@ class IngestFileWorker(SandcrawlerWorker):
"digital.ucd.ie/", # ireland national historical
]
+ self.wall_blocklist = [
+ # loginwall
+ "://profile.thieme.de/HTML/sso/ejournals/login.htm",
+ ]
+
# these are special-case web domains for which we want SPN2 to not run
# a headless browser (brozzler), but instead simply run wget.
# the motivation could be to work around browser issues, or in the
@@ -330,6 +335,12 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = "skip-url-blocklist"
return result
+ # check against known loginwall URLs
+ for block in self.wall_blocklist:
+ if block in next_url:
+ result['status'] = "skip-wall"
+ return result
+
# check for popular cookie blocking URL patterns. On successful SPN
# crawls, shouldn't see these redirect URLs
if '/cookieAbsent' in next_url or 'cookieSet=1' in next_url: