diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 22:19:27 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 22:19:27 -0800 |
commit | 66adc96e200b9fbcf8029177c7cee12872a1f563 (patch) | |
tree | 50d4041255fde72cb3d6ae80ec613fbbec98c379 /python | |
parent | cd09c66c136fceea5872e9601854e48d72dc1dae (diff) | |
download | sandcrawler-66adc96e200b9fbcf8029177c7cee12872a1f563.tar.gz sandcrawler-66adc96e200b9fbcf8029177c7cee12872a1f563.zip |
html: more adblock
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/html_metadata.py | 4 |
1 file changed, 3 insertions, 1 deletion
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index a52d339..367fce4 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -560,6 +560,7 @@ def load_adblock_rules() -> braveblock.Adblocker: "||www.mendeley.com^", "||pbs.twimg.com^", "||badge.dimensions.ai^", + "||recaptcha.net^", # not sure about these CC badges (usually via a redirect) #"||licensebuttons.net^", @@ -570,9 +571,10 @@ def load_adblock_rules() -> braveblock.Adblocker: #"||ajax.googleapis.com^", #"||cdnjs.cloudflare.com^", - # badges, "share" buttons, etc + # badges, "share" buttons, tracking, etc "apis.google.com/js/plusone", "www.google.com/recaptcha/", + "js/_getUACode.js" # PLOS images "/resource/img/icon.*.16.png^", |