diff options
-rw-r--r-- | python/sandcrawler/ingest.py | 7 |
1 file changed, 7 insertions, 0 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index fa60e27..630c477 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -140,6 +140,7 @@ class IngestFileWorker(SandcrawlerWorker): "://osapublishing.org/captcha/", "/password-login", "://gateway.isiknowledge.com/", + "/login?TARGET=", ] self.cookie_blocklist = [ @@ -589,6 +590,12 @@ class IngestFileWorker(SandcrawlerWorker): return result if resource.terminal_url: + for pattern in self.base_url_blocklist: + if pattern in resource.terminal_url: + result['status'] = 'skip-url-blocklist' + return result + + if resource.terminal_url: for pattern in self.cookie_blocklist: if pattern in resource.terminal_url: result['status'] = 'blocked-cookie' |