about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-09-03 12:21:41 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-09-03 12:21:41 -0700
commit 660fe9eeb3ec2bd0f4ae8c9c62932098c1a7b625 (patch)
tree 3a5b641cb9de2edebdde71b5196b1a8c7dd32dd4
parent 2ebef36c083b59d158fae7098da49bf972141f1c (diff)
download sandcrawler-660fe9eeb3ec2bd0f4ae8c9c62932098c1a7b625.tar.gz
sandcrawler-660fe9eeb3ec2bd0f4ae8c9c62932098c1a7b625.zip
refactor and expand wall/block/cookie URL patterns
-rw-r--r-- python/sandcrawler/ingest.py 31
-rw-r--r-- python/tests/test_ingest.py 14
2 files changed, 39 insertions, 6 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 3fa34e3..fa60e27 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -95,9 +95,13 @@ class IngestFileWorker(SandcrawlerWorker):
"://archive.org/",
"://www.archive.org/",
"://web.archive.org/web/",
+
+ # out of scope
"://openlibrary.org/",
"://www.openlibrary.org/",
"://fatcat.wiki/",
+ "://orcid.org/",
+ "://doaj.org/",
# Domain squats
"://bartandjones.com",
@@ -123,6 +127,9 @@ class IngestFileWorker(SandcrawlerWorker):
# DOI prefixes
"://doi.org/10.2307/", # JSTOR; slow and many redirects
+
+ # deprecated domain (doesn't redirect correctly)
+ "://edoc.mpg.de/",
]
self.wall_blocklist = [
@@ -131,6 +138,14 @@ class IngestFileWorker(SandcrawlerWorker):
"://login.bepress.com/",
"?SAMLRequest=",
"://osapublishing.org/captcha/",
+ "/password-login",
+ "://gateway.isiknowledge.com/",
+ ]
+
+ self.cookie_blocklist = [
+ "/cookieAbsent",
+ "cookieSet=1",
+ "error=cookies_not_supported",
]
# these are special-case web domains for which we want SPN2 to not run
@@ -518,14 +533,16 @@ class IngestFileWorker(SandcrawlerWorker):
# check against known loginwall URLs
for block in self.wall_blocklist:
if block in next_url:
+ # TODO: blocked-wall instead of skip-wall
result['status'] = "skip-wall"
return result
# check for popular cookie blocking URL patterns. On successful SPN
# crawls, shouldn't see these redirect URLs
- if '/cookieAbsent' in next_url or 'cookieSet=1' in next_url or 'error=cookies_not_supported' in next_url:
- result['status'] = 'blocked-cookie'
- return result
+ for pattern in self.cookie_blocklist:
+ if pattern in next_url:
+ result['status'] = 'blocked-cookie'
+ return result
try:
resource = self.find_resource(next_url, best_mimetype, force_recrawl=force_recrawl)
@@ -571,9 +588,11 @@ class IngestFileWorker(SandcrawlerWorker):
result['status'] = resource.status
return result
- if resource.terminal_url and ('/cookieAbsent' in resource.terminal_url or 'cookieSet=1' in resource.terminal_url):
- result['status'] = 'blocked-cookie'
- return result
+ if resource.terminal_url:
+ for pattern in self.cookie_blocklist:
+ if pattern in resource.terminal_url:
+ result['status'] = 'blocked-cookie'
+ return result
if not resource.body:
result['status'] = 'null-body'
diff --git a/python/tests/test_ingest.py b/python/tests/test_ingest.py
index 46346b7..b51f721 100644
--- a/python/tests/test_ingest.py
+++ b/python/tests/test_ingest.py
@@ -191,3 +191,17 @@ def test_ingest_wall_blocklist(ingest_worker):
assert resp['status'] == "skip-wall"
assert resp['request'] == request
+@responses.activate
+def test_ingest_cookie_blocklist(ingest_worker):
+
+ request = {
+ 'ingest_type': 'pdf',
+ 'base_url': "https://test.fatcat.wiki/cookieAbsent",
+ }
+
+ resp = ingest_worker.process(request)
+
+ assert resp['hit'] == False
+ assert resp['status'] == "blocked-cookie"
+ assert resp['request'] == request
+