From c5733acabe0fa843c71feb14cbb8792296e4b8e2 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 29 Oct 2020 14:28:59 -0700 Subject: html: more biblio selectors; resource extraction --- python/sandcrawler/html_metadata.py | 102 ++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index c7b8085..6b1bdef 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -1,10 +1,12 @@ import datetime from typing import List, Optional, Any +import urllib.parse import dateparser from selectolax.parser import HTMLParser import pydantic +import braveblock # this is a map of metadata keys to CSS selectors @@ -21,6 +23,7 @@ HEAD_META_PATTERNS: Any = { "meta[name='eprints.title']", "meta[name='prism.title']", "meta[name='bepress_citation_title']", + "meta[name='og:title']", "meta[name='dcterms.title']", "meta[name='dc.title']", ], @@ -34,6 +37,7 @@ HEAD_META_PATTERNS: Any = { "meta[name='prism.doi']", "meta[name='bepress_citation_doi']", "meta[name='dc.identifier.doi']", + "meta[name='dc.identifier'][scheme='doi']", ], "pmid": [ "meta[name='citation_pmid']", @@ -70,6 +74,7 @@ HEAD_META_PATTERNS: Any = { "meta[name='citation_online_date']", "meta[name='bepress_citation_online_date']", "meta[itemprop='datePublished']", + "meta[name='article:published']", "meta[name='eprints.datestamp']", "meta[name='eprints.date']", "meta[name='dc.date.created']", @@ -145,12 +150,14 @@ HEAD_META_PATTERNS: Any = { "meta[name='bepress_citation_language']", "meta[name='dcterms.language']", "meta[name='dc.language']", + "meta[name='og:locale']", ], "html_fulltext_url": [ "meta[name='citation_fulltext_html_url']", "meta[name='bepress_citation_fulltext_html_url']", ], "xml_fulltext_url": [ + "meta[name='citation_xml_url']", ], "pdf_fulltext_url": [ "meta[name='citation_pdf_url']", @@ -164,6 +171,7 @@ HEAD_META_LIST_PATTERNS: Any = { 
"meta[name='bepress_citation_author']", "meta[name='eprints.creators_name']", "meta[name='dcterms.creator']", + "meta[name='article:author']", "meta[name='dc.creator']", "meta[name='dc.contributor']", ], @@ -281,3 +289,97 @@ def html_extract_biblio(doc: HTMLParser) -> Optional[BiblioMetadata]: meta['release_type'] = release_type return BiblioMetadata(**meta) + +def load_adblock_rules() -> braveblock.Adblocker: + """ + TODO: consider blocking very generic assets: + + - favicon.ico + - ://fonts.googleapis.com/css* + - ://widgets.figshare.com/* + - ://crossmark-cdn.crossref.org/widget/* + - ://code.jquery.com/* + => hrm + - ://platform.twitter.com/widgets.js + - ://journals.plos.org/plosone/resource/img/icon.* + """ + return braveblock.Adblocker( + include_easylist=True, + include_easyprivacy=True, + rules=[ + "/favicon.ico^", + "||fonts.googleapis.com^", + "||widgets.figshare.com^", + "||crossmark-cdn.crossref.org^", + "||platform.twitter.com^", + "||verify.nature.com^", + "||s7.addthis.com^", + "||www.mendeley.com^", + "||pbs.twimg.com^", + "||badge.dimensions.ai^", + + # not sure about these CC badges (usually via a redirect) + #"||licensebuttons.net^", + #"||i.creativecommons.org^", + + # Should we skip jquery, or other generic javascript CDNs? + #"||code.jquery.com^", + #"||ajax.googleapis.com^", + #"||cdnjs.cloudflare.com^", + + # PLOS images + "/resource/img/icon.*.16.png^", + ], + ) + + +def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name: str) -> list: + resources = [] + + for node in doc.css(selector): + for attr in attrs: + url = node.attrs.get(attr) + if url: + resources.append(dict(url=url, type=type_name)) + + return resources + + +def html_extract_resources(doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker) -> list: + """ + This function tries to find all the important resources in a page. 
The + presumption is that the HTML document is article fulltext, and we want the + list of all resources (by URL) necessary to replay the page. + + The returned resource URLs each have a type (script, img, css, etc), and + should be fully-qualified URLs (not relative). + + Adblock filtering is run to remove unwanted resources. + """ + resources = [] + + # select various resource references + resources += _extract_generic(doc, "script", ["src"], "script") + resources += _extract_generic(doc, "link[rel='stylesheet']", ["href"], "stylesheet") + # TODO: srcset and parse + # eg: https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-small.jpg 355w + resources += _extract_generic(doc, "img", ["src"], "image") + resources += _extract_generic(doc, "audio", ["src"], "audio") + resources += _extract_generic(doc, "video", ["src"], "media") + resources += _extract_generic(doc, "source", ["src"], "media") + resources += _extract_generic(doc, "track", ["src"], "media") + resources += _extract_generic(doc, "iframe", ["src"], "subdocument") + resources += _extract_generic(doc, "embed", ["src"], "media") + + # ensure URLs are absolute + for r in resources: + r['url'] = urllib.parse.urljoin(doc_url, r['url']) + + # filter using adblocker + resources = [r for r in resources if adblock.check_network_urls(r['url'], source_url=doc_url, request_type=r['type']) == False] + + # remove duplicates + resources = [dict(t) for t in {tuple(d.items()) for d in resources}] + + return resources + -- cgit v1.2.3