From f51f7d888d1b30ea874c9656d5cacc84ec7ab8d2 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 15 Jul 2022 12:54:13 -0700 Subject: HTML ingest: most sub-resource patterns to skip --- python/sandcrawler/html_metadata.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'python') diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 7b44bfe..180c6a2 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -844,6 +844,9 @@ def load_adblock_rules() -> braveblock.Adblocker: "||pbs.twimg.com^", "||badge.dimensions.ai^", "||recaptcha.net^", + "||tag.imagino.com^", + "||consent.cookiebot.com^", + "||recaptcha.net^", # not sure about these CC badges (usually via a redirect) # "||licensebuttons.net^", # "||i.creativecommons.org^", @@ -857,6 +860,8 @@ def load_adblock_rules() -> braveblock.Adblocker: "js/_getUACode.js" # PLOS images "/resource/img/icon.*.16.png^", + # CAIRN broken tracking tag + "cairn-int.info//about.php?cairn_guest=", ], ) @@ -873,12 +878,19 @@ def _extract_generic( url = node.attrs.get(attr) # special-case a couple meta URI prefixes which don't match with adblock rules skip = False - for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:"]: + for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:", "javascript:"]: if url and url.startswith(prefix): skip = True break + if url and "/" not in url and "." not in url and " " in url: + # eg: "Ce fichier n'existe pas" + skip = True if skip: continue + if url and url.startswith("https://https://"): + url = url[8:] + elif url and url.startswith("http://http://"): + url = url[7:] if url: # print(url, file=sys.stderr) resources.append(dict(url=url.strip(), type=type_name)) -- cgit v1.2.3