diff options
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- | python/sandcrawler/ingest.py | 11 |
1 files changed, 11 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 1f693dc..918a832 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -102,6 +102,11 @@ class IngestFileWorker(SandcrawlerWorker):
             "digital.ucd.ie/", # ireland national historical
         ]
 
+        self.wall_blocklist = [
+            # loginwall
+            "://profile.thieme.de/HTML/sso/ejournals/login.htm",
+        ]
+
         # these are special-case web domains for which we want SPN2 to not run
         # a headless browser (brozzler), but instead simply run wget.
         # the motivation could be to work around browser issues, or in the
@@ -330,6 +335,12 @@ class IngestFileWorker(SandcrawlerWorker):
                     result['status'] = "skip-url-blocklist"
                     return result
 
+            # check against known loginwall URLs
+            for block in self.wall_blocklist:
+                if block in next_url:
+                    result['status'] = "skip-wall"
+                    return result
+
             # check for popular cookie blocking URL patterns. On successful SPN
             # crawls, shouldn't see these redirect URLs
             if '/cookieAbsent' in next_url or 'cookieSet=1' in next_url: