about summary refs log tree commit diff stats
path: root/python/sandcrawler/ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/ingest.py')
-rw-r--r-- python/sandcrawler/ingest.py 7
1 files changed, 7 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index fa60e27..630c477 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -140,6 +140,7 @@ class IngestFileWorker(SandcrawlerWorker):
"://osapublishing.org/captcha/",
"/password-login",
"://gateway.isiknowledge.com/",
+ "/login?TARGET=",
]
self.cookie_blocklist = [
@@ -589,6 +590,12 @@ class IngestFileWorker(SandcrawlerWorker):
return result
if resource.terminal_url:
+ for pattern in self.base_url_blocklist:
+ if pattern in resource.terminal_url:
+ result['status'] = 'skip-url-blocklist'
+ return result
+
+ if resource.terminal_url:
for pattern in self.cookie_blocklist:
if pattern in resource.terminal_url:
result['status'] = 'blocked-cookie'