aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-11-08 22:19:27 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-08 22:19:27 -0800
commit: 66adc96e200b9fbcf8029177c7cee12872a1f563 (patch)
tree: 50d4041255fde72cb3d6ae80ec613fbbec98c379
parent: cd09c66c136fceea5872e9601854e48d72dc1dae (diff)
download: sandcrawler-66adc96e200b9fbcf8029177c7cee12872a1f563.tar.gz
download: sandcrawler-66adc96e200b9fbcf8029177c7cee12872a1f563.zip
html: more adblock
-rw-r--r-- python/sandcrawler/html_metadata.py 4
1 file changed, 3 insertions, 1 deletion
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index a52d339..367fce4 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -560,6 +560,7 @@ def load_adblock_rules() -> braveblock.Adblocker:
"||www.mendeley.com^",
"||pbs.twimg.com^",
"||badge.dimensions.ai^",
+ "||recaptcha.net^",
# not sure about these CC badges (usually via a redirect)
#"||licensebuttons.net^",
@@ -570,9 +571,10 @@ def load_adblock_rules() -> braveblock.Adblocker:
#"||ajax.googleapis.com^",
#"||cdnjs.cloudflare.com^",
- # badges, "share" buttons, etc
+ # badges, "share" buttons, tracking, etc
"apis.google.com/js/plusone",
"www.google.com/recaptcha/",
+ "js/_getUACode.js",
# PLOS images
"/resource/img/icon.*.16.png^",