From 7e8ff96fb90ddd1c853418a6c405d97afbc45355 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 11 Aug 2020 17:22:10 -0700 Subject: check for simple URL patterns that are usually paywalls or loginwalls --- python/sandcrawler/ingest.py | 11 +++++++++++ python/tests/test_ingest.py | 18 ++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 1f693dc..918a832 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -102,6 +102,11 @@ class IngestFileWorker(SandcrawlerWorker): "digital.ucd.ie/", # ireland national historical ] + self.wall_blocklist = [ + # loginwall + "://profile.thieme.de/HTML/sso/ejournals/login.htm", + ] + # these are special-case web domains for which we want SPN2 to not run # a headless browser (brozzler), but instead simply run wget. # the motivation could be to work around browser issues, or in the @@ -330,6 +335,12 @@ class IngestFileWorker(SandcrawlerWorker): result['status'] = "skip-url-blocklist" return result + # check against known loginwall URLs + for block in self.wall_blocklist: + if block in next_url: + result['status'] = "skip-wall" + return result + # check for popular cookie blocking URL patterns. On successful SPN # crawls, shouldn't see these redirect URLs if '/cookieAbsent' in next_url or 'cookieSet=1' in next_url: diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py index c2d6266..46346b7 100644 --- a/python/tests/test_ingest.py +++ b/python/tests/test_ingest.py @@ -173,3 +173,21 @@ def test_ingest_blocklist(ingest_worker): assert resp['status'] == "skip-url-blocklist" assert resp['request'] == request + +@responses.activate +def test_ingest_wall_blocklist(ingest_worker): + + ingest_worker.wall_blocklist = [ + '://test.fatcat.wiki/', + ] + request = { + 'ingest_type': 'pdf', + 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf", + } + + resp = ingest_worker.process(request) + + assert resp['hit'] == False + assert resp['status'] == "skip-wall" + assert resp['request'] == request + -- cgit v1.2.3