diff options
Diffstat (limited to 'python/sandcrawler/html_metadata.py')
-rw-r--r-- | python/sandcrawler/html_metadata.py | 857 |
1 files changed, 857 insertions, 0 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py new file mode 100644 index 0000000..1a328ef --- /dev/null +++ b/python/sandcrawler/html_metadata.py @@ -0,0 +1,857 @@ + +import sys +import datetime +from typing import List, Optional, Any, Tuple, Dict +import urllib.parse + +import dateparser +from selectolax.parser import HTMLParser +import pydantic +import braveblock + +from sandcrawler.misc import url_fuzzy_equal + + +# this is a map of metadata keys to CSS selectors +# sources for this list include: +# - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing) +# - inspection of actual publisher HTML +# - http://div.div1.com.au/div-thoughts/div-commentaries/66-div-commentary-metadata +# - "HTML meta tags used by journal articles" +# https://gist.github.com/hubgit/5985963 +# order of these are mostly by preference/quality (best option first), though +# also/sometimes re-ordered for lookup efficiency (lookup stops after first +# match) +HEAD_META_PATTERNS: Any = { + "title": [ + "meta[name='citation_title']", + "meta[name='eprints.title']", + "meta[name='prism.title']", + "meta[name='bepress_citation_title']", + "meta[name='og:title']", + "meta[name='dcterms.title']", + "meta[name='dc.title']", + ], + "subtitle": [ + "meta[name='prism.subtitle']", + ], + "doi": [ + "meta[name='citation_doi']", + "meta[name='DOI']", + "meta[id='DOI']", + "meta[name='prism.doi']", + "meta[name='bepress_citation_doi']", + "meta[name='dc.identifier.doi']", + "meta[name='dc.identifier'][scheme='doi']", + ], + "pmid": [ + "meta[name='citation_pmid']", + ], + "abstract": [ + "meta[name='citation_abstract']", + "meta[name='bepress_citation_abstract']", + "meta[name='eprints.abstract']", + "meta[name='dcterms.abstract']", + "meta[name='prism.teaser']", + "meta[name='dc.description']", + "meta[name='og:description']", + ], + "container_name": [ + "meta[name='citation_journal_title']", + "meta[name='bepress_citation_journal_title']", + "meta[name='citation_conference_title']", + "meta[name='bepress_citation_conference_title']", + "meta[name='prism.publicationName']", + "meta[name='eprints.publication']", + "meta[name='dc.relation.ispartof']", + "meta[name='dc.source']", + "meta[property='og:site_name']", + ], + "container_abbrev": [ + "meta[name='citation_journal_abbrev']", + ], + "raw_date": [ + "meta[name='citation_publication_date']", + "meta[name='bepress_citation_publication_date']", + "meta[name='prism.publicationDate']", + "meta[name='citation_date']", + "meta[name='bepress_citation_date']", + "meta[name='citation_online_date']", + "meta[name='bepress_citation_online_date']", + "meta[itemprop='datePublished']", + "meta[name='article:published']", + "meta[name='eprints.datestamp']", + "meta[name='eprints.date']", + "meta[name='dc.date.created']", + "meta[name='dc.issued']", + "meta[name='dcterms.date']", + "meta[name='dc.date']", + ], + "release_year": [ + "meta[itemprop='citation_year']", + "meta[itemprop='prism:copyrightYear']", + ], + "first_page": [ + "meta[name='citation_firstpage']", + "meta[name='bepress_citation_firstpage']", + "meta[name='prism.startingPage']", + "meta[name='dc.citation.spage']", + ], + "last_page": [ + "meta[name='citation_lastpage']", + "meta[name='bepress_citation_lastpage']", + "meta[name='prism.endingPage']", + "meta[name='dc.citation.epage']", + ], + "issue": [ + "meta[name='citation_issue']", + "meta[name='bepress_citation_issue']", + "meta[name='prism.issueIdentifier']", + "meta[name='dc.citation.issue']", + ], + "volume": [ + "meta[name='citation_volume']", + "meta[name='bepress_citation_volume']", + "meta[name='prism.volume']", + "meta[name='dc.citation.volume']", + ], + "number": [ + "meta[name='citation_technical_report_number']", + "meta[name='bepress_citation_technical_report_number']", + "meta[name='citation_number']", + "meta[name='bepress_citation_number']", + "meta[name='prism.number']", + ], + "container_issn": [ + "meta[name='citation_issn']", + "meta[name='bepress_citation_issn']", + "meta[name='prism.issn']", + "meta[name='prism.eIssn']", + "meta[name='eprints.issn']", + "meta[name='dc.source.issn']", + ], + "isbn": [ + "meta[name='citation_isbn']", + "meta[name='bepress_citation_isbn']", + "meta[name='prism.isbn']", + ], + "publisher": [ + "meta[name='citation_publisher']", + "meta[name='bepress_citation_publisher']", + "meta[name='eprints.publisher']", + "meta[name='citation_technical_report_institution']", + "meta[name='dcterms.publisher']", + "meta[name='dc.publisher']", + ], + "raw_release_type": [ + "meta[name='citation_article_type']", + "meta[name='bepress_citation_article_type']", + "meta[name='prism.contentType']", + "meta[name='eprints.type']", + "meta[name='dc.type']", + ], + "lang": [ + "meta[name='citation_language']", + "meta[name='bepress_citation_language']", + "meta[name='dcterms.language']", + "meta[name='dc.language']", + "meta[name='og:locale']", + ], +} + +HEAD_META_LIST_PATTERNS: Any = { + "contrib_names": [ + "meta[name='citation_author']", + "meta[name='bepress_citation_author']", + "meta[name='eprints.creators_name']", + "meta[name='dcterms.creator']", + "meta[name='article:author']", + "meta[name='dc.creator']", + "meta[name='dc.contributor']", + ], + # TODO: citation_author_institution + "raw_references": [ + "meta[name='citation_reference']", + ], + "raw_identifiers": [ + "meta[name='eprints.id_number']", + "meta[name='dcterms.identifier']", + "meta[name='dc.identifier']", + ], +} + +XML_FULLTEXT_PATTERNS: List[dict] = [ + { + "selector": "meta[name='citation_xml_url']", + "attr": "content", + "technique": "citation_xml_url", + }, + { + "selector": "meta[name='fulltext_xml']", + "attr": "content", + "technique": "fulltext_xml", + }, + { + "selector": "link[rel='alternate'][type='application/xml']", + "attr": "href", + "technique": "alternate link", + }, + { + "selector": "link[rel='alternate'][type='text/xml']", + "attr": "href", + "technique": "alternate link", + }, + { + "in_doc_url": "scielo", + "in_fulltext_url": "articleXML", + "selector": "a[target='xml']", + "attr": "href", + "technique": "SciElo XML link", + }, + { + "in_doc_url": "/article/view/", + "in_fulltext_url": "viewXML", + "selector": "a[class='obj_galley_link']", + "attr": "href", + "technique": "OJS Gallery XML link", + }, + { + "in_fulltext_url": "/download/xml/", + "selector": "a[title='XML']", + "attr": "href", + "technique": "ARPHA XML link", + "example_page": "https://zookeys.pensoft.net/article/26391", + }, + { + "in_doc_url": "frontiersin.org/", + "in_fulltext_url": "xml", + "selector": "a.download-files-nlm", + "attr": "href", + "technique": "XML (NLM) download link (frontiersin.org)", + "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full", + }, +] + +HTML_FULLTEXT_PATTERNS: List[dict] = [ + { + "selector": "meta[name='citation_fulltext_html_url']", + "attr": "content", + "technique": "citation_fulltext_html_url", + }, + { + "selector": "link[rel='alternate'][type='text/html']", + "attr": "href", + "technique": "alternate link", + }, + { + "in_doc_url": "/article/view/", + "in_fulltext_url": "inline=1", + "selector": "iframe[name='htmlFrame']", + "attr": "src", + "technique": "OJS HTML iframe", + }, + { + "in_doc_url": "dovepress.com", + "in_fulltext_url": "-fulltext-", + "selector": "a[id='view-full-text']", + "attr": "href", + "technique": "dovepress fulltext link", + }, +] + +COMPONENT_FULLTEXT_PATTERNS: List[dict] = [ + { + "in_doc_url": "pensoft.net/article/", # also /element/ + "in_fulltext_url": "/download/fig/", + "selector": ".Main-Content .figure a.P-Article-Preview-Picture-Download-Small", + "attr": "href", + "technique": "Active figure download link (zookeys)", + "example_page": "https://zookeys.pensoft.net/article/38576/element/2/153/", + }, +] + +# This is a database of matching patterns. Most of these discovered by hand, +# looking at OA journal content that failed to craw/ingest. +PDF_FULLTEXT_PATTERNS: List[dict] = [ + { + "selector": "head meta[name='citation_pdf_url']", + "attr": "content", + "technique": "citation_pdf_url", + }, + { + "selector": "head meta[name='bepress_citation_pdf_url']", + "attr": "content", + "technique": "citation_pdf_url", + }, + { + "in_doc_url": "journals.lww.com", + "selector": "head meta[name='wkhealth_pdf_url']", + "attr": "content", + "technique": "wkhealth_pdf_url", + "example_page": "https://journals.lww.com/otainternational/Fulltext/2019/03011/Trauma_systems_in_North_America.2.aspx", + }, + { + "selector": "head meta[propery='citation_pdf_url']", + "attr": "content", + "technique": "citation_pdf_url", + # eg, researchgate + }, + { + "selector": "head meta[name='eprints.document_url']", + "attr": "content", + "technique": "citation_pdf_url (property)", + }, + { + "in_doc_url": "/doi/10.", + "in_fulltext_url": "/doi/pdf/", + "selector": "a.show-pdf", + "attr": "href", + "technique": "SAGE/UTP show-pdflink", + "example_page": "https://journals.sagepub.com/doi/10.1177/2309499019888836", + # also http://utpjournals.press/doi/10.3138/cjh.ach.54.1-2.05 + }, + { + "in_doc_url": "/doi/10.", + "in_fulltext_url": "/doi/pdf/", + "selector": "a[title='PDF']", + "attr": "href", + "technique": "title=PDF link", + "example_page": "https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379", + }, + { + "in_doc_url": "/article/view/", + "selector": "a#pdfDownloadLink", + "attr": "href", + "technique": "pdfDownloadLink link", + "example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336", + }, + { + "in_fulltext_url": "/pdf/", + "selector": "a.show-pdf", + "attr": "href", + "technique": "SAGE PDF link", + "example_page": "http://journals.sagepub.com/doi/pdf/10.1177/2309499019888836", + }, + { + "in_doc_url": "://elifesciences.org/articles/", + "in_fulltext_url": "/download/", + "selector": "a[data-download-type='pdf-article']", + "attr": "href", + "technique": "eLife PDF link", + "example_page": "https://elifesciences.org/articles/59841", + }, + { + "in_doc_url": "://www.jcancer.org/", + "in_fulltext_url": ".pdf", + "selector": ".divboxright a.text-button", + "attr": "href", + "technique": "jcancer PDF link", + "example_page": "https://www.jcancer.org/v10p4038.htm", + }, + { + "in_doc_url": "://www.tandfonline.com/doi/full/10.", + "in_fulltext_url": "/pdf/", + "selector": "a.show-pdf", + "attr": "href", + "technique": "t+f show-pdf link", + "example_page": "https://www.tandfonline.com/doi/full/10.1080/19491247.2019.1682234", + }, + { + "in_doc_url": "article_id=", + "in_fulltext_url": "download.php", + "selector": "a.file.pdf", + "attr": "href", + "technique": "pdf file link", + "example_page": "http://journals.tsu.ru/psychology/&journal_page=archive&id=1815&article_id=40405", + }, + { + "in_doc_url": "/content/10.", + "in_fulltext_url": "pdf", + "selector": "a.pdf[title='Download']", + "attr": "href", + "technique": "pdf file link", + "example_page": "https://www.eurosurveillance.org/content/10.2807/1560-7917.ES.2020.25.11.2000230", + }, + { + "selector": "embed[type='application/pdf']", + "attr": "src", + "technique": "PDF embed", + "example_page": "http://www.jasstudies.com/DergiTamDetay.aspx?ID=3401", + }, + { + "in_doc_url": "/html/", + "in_fulltext_url": "create_pdf", + "selector": ".AbsPdfFigTab img[src='images/pdf-icon.jpg'] + a", + "attr": "href", + "technique": "PDF URL link", + "example_page": "http://www.aed.org.cn/nyzyyhjxb/html/2018/4/20180408.htm", + }, + { + "in_doc_url": "/archive-detail/", + "in_fulltext_url": ".pdf", + "selector": ".contact-list a.download-pdf", + "attr": "href", + "technique": "PDF URL link", + "example_page": "http://www.bezmialemscience.org/archives/archive-detail/article-preview/editorial/20439", + }, + { + "in_doc_url": "degruyter.com/document/", + "in_fulltext_url": "/pdf", + "selector": "a.downloadPdf", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://www.degruyter.com/document/doi/10.1515/zaw-2021-0001/html", + }, + { + "in_doc_url": "repositorio.unicamp.br/handle/", + "in_fulltext_url": "/bitstream/", + "selector": "table.panel-body a[target='_blank']", + "attr": "href", + "technique": "PDF URL link", + "example_page": "http://www.repositorio.unicamp.br/handle/REPOSIP/287750", + }, + { + "in_doc_url": "dlc.library.columbia.edu/durst/", + "selector": "dd.blacklight-lib_non_item_in_context_url_ssm a[href]", + "attr": "href", + "technique": "Access URL link", + "example_page": "https://dlc.library.columbia.edu/durst/cul:18931zcrk9", + }, + { + "in_doc_url": "fldeploc.dep.state.fl.us/geodb_query/fgs_doi", + "in_fulltext_url": "pdf", + "selector": "p a[href]", + "attr": "href", + "technique": "PDF URL link", + "example_page": "http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29", + }, + { + "in_doc_url": "preprints.jmir.org/preprint/", + "selector": "a.pdf-download-button", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://preprints.jmir.org/preprint/22556", + }, + { + "in_doc_url": "bloomsburycollections.com/", + "in_fulltext_url": "pdf", + "selector": "li.download-item a[href]", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries", + }, + { + "in_doc_url": "emerald.com/insight/content/", + "in_fulltext_url": "pdf", + "selector": "a.intent_pdf_link", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html", + }, + { + "in_doc_url": "ingentaconnect.com/content/", + "in_fulltext_url": "pdf", + "selector": "a.pdf[data-popup]", + "attr": "data-popup", + "technique": "PDF URL link", + "example_page": "https://www.ingentaconnect.com/content/ista/sst/2021/00000049/00000001/art00007", + }, + { + "in_doc_url": "library.wur.nl/", + "in_fulltext_url": "pdf", + "selector": "a.wl_full_text_restricted", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://library.wur.nl/WebQuery/wurpubs/529922", + }, + { + "in_doc_url": "/dlibra/", + "in_fulltext_url": "pdf", + "selector": "iframe#js-main-frame", + "attr": "src", + "technique": "PDF iframe (dlibra)", + "example_page": "https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031", + }, + { + "in_doc_url": "/handle/", + "in_fulltext_url": "pdf", + "selector": "table.misc table.inner tr.b a", + "attr": "href", + "technique": "PDF URL link (DSpace, first file)", + "example_page": "https://orbi.uliege.be/handle/2268/174200", + }, + { + "in_doc_url": "/publications/", + "in_fulltext_url": "pdf", + "selector": ".publication-sidebar li.open-access a.document-link", + "attr": "href", + "technique": "PDF URL link (Pure repo, OA link)", + "example_page": "https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance", + }, + { + "in_doc_url": "//hal", + "selector": ".widget-openaccess .widget-content a", + "attr": "href", + "technique": "Fulltext OA URL (HAL)", + "example_page": "https://hal.archives-ouvertes.fr/hal-00744951", + }, + { + "in_doc_url": "/record/", + "in_fulltext_url": "pdf", + "selector": "#detailedrecordminipanelfile a", + "attr": "href", + "technique": "PDF URL link (Invenio)", + "example_page": "https://bib-pubdb1.desy.de/record/416556", + }, + { + "in_doc_url": "/available/", + "in_fulltext_url": "pdf", + "selector": "table.file-table a", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://etd.adm.unipi.it/theses/available/etd-05302014-183910/", + }, + { + "in_doc_url": "/islandora/", + "in_fulltext_url": "pdf", + "selector": "a.islandora-pdf-link", + "attr": "href", + "technique": "PDF URL link (Islandora)", + "example_page": "http://fau.digital.flvc.org/islandora/object/fau%3A9804", + }, + { + "in_doc_url": "/receive/", + "in_fulltext_url": "pdf", + "selector": ".mir-preview noscript a", + "attr": "href", + "technique": "PDF iframe via noscript (MyCoRe)", + "example_page": "https://www.db-thueringen.de/receive/dbt_mods_00005191", + }, + { + "in_doc_url": "/registro.do", + "in_fulltext_url": "imagenes", + "selector": ".resumen_bib a[data-analytics=media]", + "attr": "href", + "technique": "Media link (DIGIBIS)", + "example_page": "https://bivaldi.gva.es/es/consulta/registro.do?id=11740", + }, + { + "in_doc_url": "/view", + "in_fulltext_url": "/at_download/", + "selector": ".documentContent #content a", + "attr": "href", + "technique": "Media link (Plone)", + "example_page": "http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view", + }, + { + "in_doc_url": "isca-speech.org/", + "in_fulltext_url": "pdf", + "selector": ".w3-container a", + "attr": "href", + "technique": "PDF URL link (isca-speech.org)", + "example_page": "https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html", + }, + { + "in_doc_url": "://repository.dri.ie/", + "in_fulltext_url": "/download", + "selector": "#dri_download_assets > div > a", + "attr": "href", + "technique": "Download link (repository.dri.ie)", + "example_page": "https://repository.dri.ie/catalog/qf8621102", + }, + { + "in_doc_url": "frontiersin.org/", + "in_fulltext_url": "pdf", + "selector": "a.download-files-pdf", + "attr": "href", + "technique": "PDF Download link (frontiersin.org)", + "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full", + }, + { + "in_doc_url": "cureus.com/", + "in_fulltext_url": "pdf", + "selector": ".small-medium-pdf a.pdf-download-button", + "attr": "href", + "technique": "PDF Download link (cureus.com)", + "example_page": "https://www.cureus.com/articles/69542-tramadol-induced-jerks", + }, + { + "in_doc_url": "e-manuscripta.ch/", + "in_fulltext_url": "pdf", + "selector": "#titleinfoPdfDownload a.resourceLink", + "attr": "href", + "technique": "PDF Download link (e-manuscripta.ch)", + "example_page": "https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176", + }, +] + +FULLTEXT_URL_PATTERNS_SKIP = [ + # wiley has a weird almost-blank page we don't want to loop on + "://onlinelibrary.wiley.com/doi/pdf/" + "://doi.org/" + "://dx.doi.org/" +] + +RELEASE_TYPE_MAP = { + "research article": "article-journal", + "text.serial.journal": "article-journal", +} + + +class BiblioMetadata(pydantic.BaseModel): + title: Optional[str] + subtitle: Optional[str] + contrib_names: Optional[List[str]] + release_date: Optional[datetime.date] + release_year: Optional[int] + release_type: Optional[str] + release_stage: Optional[str] + withdrawn_status: Optional[str] + lang: Optional[str] + country_code: Optional[str] + volume: Optional[str] + issue: Optional[str] + number: Optional[str] + pages: Optional[str] + first_page: Optional[str] + last_page: Optional[str] + license: Optional[str] + publisher: Optional[str] + container_name: Optional[str] + container_abbrev: Optional[str] + container_issn: Optional[str] + container_type: Optional[str] + raw_references: Optional[List[str]] + + doi: Optional[str] + pmid: Optional[str] + pmcid: Optional[str] + isbn13: Optional[str] + publisher_ident: Optional[str] + oai_id: Optional[str] + + abstract: Optional[str] + pdf_fulltext_url: Optional[str] + html_fulltext_url: Optional[str] + xml_fulltext_url: Optional[str] + component_url: Optional[str] + + class Config: + json_encoders = { + datetime.date: lambda dt: dt.isoformat() + } + + +def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]: + """ + Tries to quickly extract fulltext URLs using a set of patterns. This + function is intendend to be generic across various extraction techniques. + + Returns null or a tuple of (url, technique) + """ + self_doc_url: Optional[Tuple[str, str]] = None + for pattern in patterns: + if not 'selector' in pattern: + continue + if 'in_doc_url' in pattern: + if not pattern['in_doc_url'] in doc_url: + continue + elem = doc.css_first(pattern['selector']) + if not elem: + continue + if 'attr' in pattern: + val = elem.attrs.get(pattern['attr']) + if not val: + continue + val = urllib.parse.urljoin(doc_url, val) + assert val + if 'in_fulltext_url' in pattern: + if not pattern['in_fulltext_url'] in val: + continue + for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP: + if skip_pattern in val.lower(): + continue + if url_fuzzy_equal(doc_url, val): + # don't link to self, unless no other options + self_doc_url = (val, pattern.get('technique', 'unknown')) + continue + return (val, pattern.get('technique', 'unknown')) + if self_doc_url: + print(f" WARN: returning fulltext URL pointing to self", file=sys.stderr) + return self_doc_url + return None + +def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]: + + meta: Any = dict() + head = doc.css_first("head") + if not head: + return None + + for field, patterns in HEAD_META_PATTERNS.items(): + for pattern in patterns: + val = head.css_first(pattern) + #print((field, pattern, val)) + if val and 'content' in val.attrs and val.attrs['content']: + meta[field] = val.attrs['content'] + break + + for field, patterns in HEAD_META_LIST_PATTERNS.items(): + for pattern in patterns: + val_list = head.css(pattern) + if val_list: + for val in val_list: + if 'content' in val.attrs and val.attrs['content']: + if not field in meta: + meta[field] = [] + meta[field].append(val.attrs['content']) + break + + # (some) fulltext extractions + pdf_fulltext_url = html_extract_fulltext_url(doc_url, doc, PDF_FULLTEXT_PATTERNS) + if pdf_fulltext_url: + meta['pdf_fulltext_url'] = pdf_fulltext_url[0] + xml_fulltext_url = html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS) + if xml_fulltext_url: + meta['xml_fulltext_url'] = xml_fulltext_url[0] + html_fulltext_url = html_extract_fulltext_url(doc_url, doc, HTML_FULLTEXT_PATTERNS) + if html_fulltext_url: + meta['html_fulltext_url'] = html_fulltext_url[0] + component_url = html_extract_fulltext_url(doc_url, doc, COMPONENT_FULLTEXT_PATTERNS) + if component_url: + meta['component_url'] = component_url[0] + + # TODO: replace with clean_doi() et al + if meta.get('doi') and meta.get('doi').startswith('doi:'): + meta['doi'] = meta['doi'][4:] + + raw_identifiers = meta.pop('raw_identifiers', []) + for ident in raw_identifiers: + if ident.startswith('doi:10.'): + if not 'doi' in meta: + meta['doi'] = ident.replace('doi:', '') + elif ident.startswith('10.') and '/' in ident: + if not 'doi' in meta: + meta['doi'] = ident + elif ident.startswith('isbn:'): + if not 'isbn' in meta: + meta['isbn'] = ident.replace('isbn:', '') + + raw_date = meta.pop('raw_date', None) + if raw_date: + parsed = dateparser.parse(raw_date) + if parsed: + meta['release_date'] = parsed.date() + + raw_release_type = meta.pop('raw_release_type', None) + if raw_release_type: + release_type = RELEASE_TYPE_MAP.get(raw_release_type.lower().strip()) + if release_type: + meta['release_type'] = release_type + + return BiblioMetadata(**meta) + +def load_adblock_rules() -> braveblock.Adblocker: + """ + TODO: consider blocking very generic assets: + - ://fonts.googleapis.com/css* + - ://journals.plos.org/plosone/resource/img/icon.* + """ + return braveblock.Adblocker( + include_easylist=True, + include_easyprivacy=True, + rules=[ + "/favicon.ico^", + "||fonts.googleapis.com^", + "||widgets.figshare.com^", + "||crossmark-cdn.crossref.org^", + "||crossmark.crossref.org^", + "||platform.twitter.com^", + "||verify.nature.com^", + "||s7.addthis.com^", + "||www.mendeley.com^", + "||pbs.twimg.com^", + "||badge.dimensions.ai^", + "||recaptcha.net^", + + # not sure about these CC badges (usually via a redirect) + #"||licensebuttons.net^", + #"||i.creativecommons.org^", + + # Should we skip jquery, or other generic javascript CDNs? + #"||code.jquery.com^", + #"||ajax.googleapis.com^", + #"||cdnjs.cloudflare.com^", + + # badges, "share" buttons, tracking, etc + "apis.google.com/js/plusone", + "www.google.com/recaptcha/", + "js/_getUACode.js" + + # PLOS images + "/resource/img/icon.*.16.png^", + ], + ) + + +def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name: str) -> list: + resources = [] + + for node in doc.css(selector): + for attr in attrs: + if not attr in node.attrs: + continue + url = node.attrs.get(attr) + # special-case a couple meta URI prefixes which don't match with adblock rules + skip = False + for prefix in ['about:', 'data:', 'magnet:', 'urn:', 'mailto:']: + if url and url.startswith(prefix): + skip = True + break + if skip: + continue + if url: + #print(url, file=sys.stderr) + resources.append(dict(url=url.strip(), type=type_name)) + + return resources + + +def html_extract_resources(doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker) -> list: + """ + This function tries to find all the important resources in a page. The + presumption is that the HTML document is article fulltext, and we want the + list of all resoures (by URL) necessary to replay the page. + + The returned resource URLs each have a type (script, img, css, etc), and + should be fully-qualified URLs (not relative). + + Adblock filtering is run to remove unwanted resources. + """ + resources = [] + + # select various resource references + resources += _extract_generic(doc, "script", ["src"], "script") + resources += _extract_generic(doc, "link[rel='stylesheet']", ["href"], "stylesheet") + # TODO: srcset and parse + # eg: https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-2x.jpg 1200w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-1x.jpg 600w, https://dfzljdn9uc3pi.cloudfront.net/2018/4375/1/fig-5-small.jpg 355w + resources += _extract_generic(doc, "img", ["src"], "image") + resources += _extract_generic(doc, "audio", ["src"], "audio") + resources += _extract_generic(doc, "video", ["src"], "media") + resources += _extract_generic(doc, "source", ["src"], "media") + resources += _extract_generic(doc, "track", ["src"], "media") + resources += _extract_generic(doc, "iframe", ["src"], "subdocument") + resources += _extract_generic(doc, "embed", ["src"], "media") + + # ensure URLs are absolute + for r in resources: + r['url'] = urllib.parse.urljoin(doc_url, r['url']) + + # filter using adblocker + resources = [r for r in resources if adblock.check_network_urls(r['url'], source_url=doc_url, request_type=r['type']) == False] + + # remove duplicates + resources = [dict(t) for t in {tuple(d.items()) for d in resources}] + + return resources + |