From 66adc96e200b9fbcf8029177c7cee12872a1f563 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 8 Nov 2020 22:19:27 -0800
Subject: html: more adblock

---
 python/sandcrawler/html_metadata.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index a52d339..367fce4 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -560,6 +560,7 @@ def load_adblock_rules() -> braveblock.Adblocker:
             "||www.mendeley.com^",
             "||pbs.twimg.com^",
             "||badge.dimensions.ai^",
+            "||recaptcha.net^",
 
             # not sure about these CC badges (usually via a redirect)
             #"||licensebuttons.net^",
@@ -570,9 +571,10 @@ def load_adblock_rules() -> braveblock.Adblocker:
             #"||ajax.googleapis.com^",
             #"||cdnjs.cloudflare.com^",
 
-            # badges, "share" buttons, etc
+            # badges, "share" buttons, tracking, etc
             "apis.google.com/js/plusone",
             "www.google.com/recaptcha/",
+            "js/_getUACode.js",
 
             # PLOS images
             "/resource/img/icon.*.16.png^",
-- 
cgit v1.2.3