diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-09-03 18:46:28 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-09-03 18:46:33 -0700 |
commit | 5afd7e43c93617569df103709795f5f7ec95380d (patch) | |
tree | e0d0c42642ffbff924afcf584899afd0b95169c7 | |
parent | d749a7a6a1c1d439596c5d053daf904638b4dbc2 (diff) | |
download | sandcrawler-5afd7e43c93617569df103709795f5f7ec95380d.tar.gz sandcrawler-5afd7e43c93617569df103709795f5f7ec95380d.zip |
ingest: check URL blocklist again after redirects
-rw-r--r-- | python/sandcrawler/ingest.py | 7 |
1 files changed, 7 insertions, 0 deletions
```diff
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index fa60e27..630c477 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -140,6 +140,7 @@ class IngestFileWorker(SandcrawlerWorker):
             "://osapublishing.org/captcha/",
             "/password-login",
             "://gateway.isiknowledge.com/",
+            "/login?TARGET=",
         ]
 
         self.cookie_blocklist = [
@@ -589,6 +590,12 @@ class IngestFileWorker(SandcrawlerWorker):
             return result
 
         if resource.terminal_url:
+            for pattern in self.base_url_blocklist:
+                if pattern in resource.terminal_url:
+                    result['status'] = 'skip-url-blocklist'
+                    return result
+
+        if resource.terminal_url:
             for pattern in self.cookie_blocklist:
                 if pattern in resource.terminal_url:
                     result['status'] = 'blocked-cookie'
```