diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-08-11 17:22:10 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-08-11 17:22:10 -0700 |
commit | 7e8ff96fb90ddd1c853418a6c405d97afbc45355 (patch) | |
tree | 8efd62adf4a92f44bdb71384af955400102b0f34 /python | |
parent | d5f0602e80847adf3d359a7fd06cc131c07cb6dd (diff) | |
download | sandcrawler-7e8ff96fb90ddd1c853418a6c405d97afbc45355.tar.gz sandcrawler-7e8ff96fb90ddd1c853418a6c405d97afbc45355.zip |
check for simple URL patterns that are usually paywalls or loginwalls
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ingest.py | 11 | ||||
-rw-r--r-- | python/tests/test_ingest.py | 18 |
2 files changed, 29 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 1f693dc..918a832 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -102,6 +102,11 @@ class IngestFileWorker(SandcrawlerWorker):
             "digital.ucd.ie/", # ireland national historical
         ]
 
+        self.wall_blocklist = [
+            # loginwall
+            "://profile.thieme.de/HTML/sso/ejournals/login.htm",
+        ]
+
         # these are special-case web domains for which we want SPN2 to not run
         # a headless browser (brozzler), but instead simply run wget.
         # the motivation could be to work around browser issues, or in the
@@ -330,6 +335,12 @@ class IngestFileWorker(SandcrawlerWorker):
                 result['status'] = "skip-url-blocklist"
                 return result
 
+        # check against known loginwall URLs
+        for block in self.wall_blocklist:
+            if block in next_url:
+                result['status'] = "skip-wall"
+                return result
+
         # check for popular cookie blocking URL patterns. On successful SPN
         # crawls, shouldn't see these redirect URLs
         if '/cookieAbsent' in next_url or 'cookieSet=1' in next_url:
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index c2d6266..46346b7 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -173,3 +173,21 @@ def test_ingest_blocklist(ingest_worker):
     assert resp['status'] == "skip-url-blocklist"
     assert resp['request'] == request
 
+
+@responses.activate
+def test_ingest_wall_blocklist(ingest_worker):
+
+    ingest_worker.wall_blocklist = [
+        '://test.fatcat.wiki/',
+    ]
+    request = {
+        'ingest_type': 'pdf',
+        'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+    }
+
+    resp = ingest_worker.process(request)
+
+    assert resp['hit'] == False
+    assert resp['status'] == "skip-wall"
+    assert resp['request'] == request
+
```