diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-15 12:54:13 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-15 12:55:23 -0700 |
commit | f51f7d888d1b30ea874c9656d5cacc84ec7ab8d2 (patch) | |
tree | 356f997a202a9ef2bc1eb049883721c04857c60f | |
parent | b5217753166956eed14cf2c91ec52d883d6a5a56 (diff) | |
download | sandcrawler-f51f7d888d1b30ea874c9656d5cacc84ec7ab8d2.tar.gz sandcrawler-f51f7d888d1b30ea874c9656d5cacc84ec7ab8d2.zip |
HTML ingest: more sub-resource patterns to skip
-rw-r--r-- | python/sandcrawler/html_metadata.py | 14 |
1 files changed, 13 insertions, 1 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index 7b44bfe..180c6a2 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -844,6 +844,9 @@ def load_adblock_rules() -> braveblock.Adblocker: "||pbs.twimg.com^", "||badge.dimensions.ai^", "||recaptcha.net^", + "||tag.imagino.com^", + "||consent.cookiebot.com^", + "||recaptcha.net^", # not sure about these CC badges (usually via a redirect) # "||licensebuttons.net^", # "||i.creativecommons.org^", @@ -857,6 +860,8 @@ def load_adblock_rules() -> braveblock.Adblocker: "js/_getUACode.js" # PLOS images "/resource/img/icon.*.16.png^", + # CAIRN broken tracking tag + "cairn-int.info//about.php?cairn_guest=", ], ) @@ -873,12 +878,19 @@ def _extract_generic( url = node.attrs.get(attr) # special-case a couple meta URI prefixes which don't match with adblock rules skip = False - for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:"]: + for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:", "javascript:"]: if url and url.startswith(prefix): skip = True break + if url and "/" not in url and "." not in url and " " in url: + # eg: "Ce fichier n'existe pas" + skip = True if skip: continue + if url and url.startswith("https://https://"): + url = url[8:] + elif url and url.startswith("http://http://"): + url = url[7:] if url: # print(url, file=sys.stderr) resources.append(dict(url=url.strip(), type=type_name)) |