aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-09-03 18:46:28 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-09-03 18:46:33 -0700
commit 5afd7e43c93617569df103709795f5f7ec95380d (patch)
tree e0d0c42642ffbff924afcf584899afd0b95169c7 /python
parent d749a7a6a1c1d439596c5d053daf904638b4dbc2 (diff)
download sandcrawler-5afd7e43c93617569df103709795f5f7ec95380d.tar.gz
sandcrawler-5afd7e43c93617569df103709795f5f7ec95380d.zip
ingest: check URL blocklist again after redirects
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/ingest.py 7
1 files changed, 7 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index fa60e27..630c477 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -140,6 +140,7 @@ class IngestFileWorker(SandcrawlerWorker):
"://osapublishing.org/captcha/",
"/password-login",
"://gateway.isiknowledge.com/",
+ "/login?TARGET=",
]
self.cookie_blocklist = [
@@ -589,6 +590,12 @@ class IngestFileWorker(SandcrawlerWorker):
return result
if resource.terminal_url:
+ for pattern in self.base_url_blocklist:
+ if pattern in resource.terminal_url:
+ result['status'] = 'skip-url-blocklist'
+ return result
+
+ if resource.terminal_url:
for pattern in self.cookie_blocklist:
if pattern in resource.terminal_url:
result['status'] = 'blocked-cookie'