about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org>, 2020-08-11 17:22:10 -0700
committer: Bryan Newbold <bnewbold@archive.org>, 2020-08-11 17:22:10 -0700
commit: 7e8ff96fb90ddd1c853418a6c405d97afbc45355 (patch)
tree: 8efd62adf4a92f44bdb71384af955400102b0f34 /python
parent: d5f0602e80847adf3d359a7fd06cc131c07cb6dd (diff)
download: sandcrawler-7e8ff96fb90ddd1c853418a6c405d97afbc45355.tar.gz
sandcrawler-7e8ff96fb90ddd1c853418a6c405d97afbc45355.zip
check for simple URL patterns that are usually paywalls or loginwalls
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py | 11
-rw-r--r-- python/tests/test_ingest.py | 18
2 files changed, 29 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 1f693dc..918a832 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -102,6 +102,11 @@ class IngestFileWorker(SandcrawlerWorker):
"digital.ucd.ie/", # ireland national historical
]
+ self.wall_blocklist = [
+ # loginwall
+ "://profile.thieme.de/HTML/sso/ejournals/login.htm",
+ ]
+
# these are special-case web domains for which we want SPN2 to not run
# a headless browser (brozzler), but instead simply run wget.
# the motivation could be to work around browser issues, or in the
@@ -330,6 +335,12 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = "skip-url-blocklist"
return result
+ # check against known loginwall URLs
+ for block in self.wall_blocklist:
+ if block in next_url:
+ result['status'] = "skip-wall"
+ return result
+
# check for popular cookie blocking URL patterns. On successful SPN
# crawls, shouldn't see these redirect URLs
if '/cookieAbsent' in next_url or 'cookieSet=1' in next_url:
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index c2d6266..46346b7 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -173,3 +173,21 @@ def test_ingest_blocklist(ingest_worker):
assert resp['status'] == "skip-url-blocklist"
assert resp['request'] == request
+
+@responses.activate
+def test_ingest_wall_blocklist(ingest_worker):
+
+ ingest_worker.wall_blocklist = [
+ '://test.fatcat.wiki/',
+ ]
+ request = {
+ 'ingest_type': 'pdf',
+ 'base_url': "https://test.fatcat.wiki/asdfasdf.pdf",
+ }
+
+ resp = ingest_worker.process(request)
+
+ assert resp['hit'] == False
+ assert resp['status'] == "skip-wall"
+ assert resp['request'] == request
+