diff options
Diffstat (limited to 'python/sandcrawler/html_metadata.py')
-rw-r--r-- | python/sandcrawler/html_metadata.py | 615 |
1 files changed, 528 insertions, 87 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index f9f48a6..1e2d197 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -1,17 +1,15 @@ - -import sys import datetime -from typing import List, Optional, Any, Tuple, Dict +import sys import urllib.parse +from typing import Any, Dict, List, Optional, Tuple +import braveblock import dateparser -from selectolax.parser import HTMLParser import pydantic -import braveblock +from selectolax.parser import HTMLParser from sandcrawler.misc import url_fuzzy_equal - # this is a map of metadata keys to CSS selectors # sources for this list include: # - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing) @@ -22,7 +20,7 @@ from sandcrawler.misc import url_fuzzy_equal # order of these are mostly by preference/quality (best option first), though # also/sometimes re-ordered for lookup efficiency (lookup stops after first # match) -HEAD_META_PATTERNS: Any = { +HEAD_META_PATTERNS: Dict[str, List[str]] = { "title": [ "meta[name='citation_title']", "meta[name='eprints.title']", @@ -159,7 +157,7 @@ HEAD_META_PATTERNS: Any = { ], } -HEAD_META_LIST_PATTERNS: Any = { +HEAD_META_LIST_PATTERNS: Dict[str, List[str]] = { "contrib_names": [ "meta[name='citation_author']", "meta[name='bepress_citation_author']", @@ -180,7 +178,7 @@ HEAD_META_LIST_PATTERNS: Any = { ], } -XML_FULLTEXT_PATTERNS: List[dict] = [ +XML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ { "selector": "meta[name='citation_xml_url']", "attr": "content", @@ -209,7 +207,7 @@ XML_FULLTEXT_PATTERNS: List[dict] = [ "technique": "SciElo XML link", }, { - "in_doc_url": "/article/view/", + "in_doc_url": "/view/", "in_fulltext_url": "viewXML", "selector": "a[class='obj_galley_link']", "attr": "href", @@ -222,9 +220,17 @@ XML_FULLTEXT_PATTERNS: List[dict] = [ "technique": "ARPHA XML link", "example_page": "https://zookeys.pensoft.net/article/26391", }, + { + "in_doc_url": "frontiersin.org/", + "in_fulltext_url": "xml", + "selector": "a.download-files-nlm", + "attr": "href", + "technique": "XML (NLM) download link (frontiersin.org)", + "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full", + }, ] -HTML_FULLTEXT_PATTERNS: List[dict] = [ +HTML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ { "selector": "meta[name='citation_fulltext_html_url']", "attr": "content", @@ -249,11 +255,36 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [ "attr": "href", "technique": "dovepress fulltext link", }, + { + "in_doc_url": "://doaj.org/article/", + "selector": "section.col-md-8 a[target='_blank'].button--primary", + "attr": "href", + "technique": "doaj.org access link", + }, +] + +COMPONENT_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ + { + "in_doc_url": "pensoft.net/article/", # also /element/ + "in_fulltext_url": "/download/fig/", + "selector": ".Main-Content .figure a.P-Article-Preview-Picture-Download-Small", + "attr": "href", + "technique": "Active figure download link (zookeys)", + "example_page": "https://zookeys.pensoft.net/article/38576/element/2/153/", + }, + { + "in_doc_url": "/file.xhtml?persistentId", + "in_fulltext_url": "/access/datafile/", + "selector": "div.form-group code", + "use_body": "true", + "technique": "Dataverse 'download URL'", + "example_page": "https://data.lipi.go.id/file.xhtml?persistentId=hdl:20.500.12690/RIN/IDDOAH/BTNH25&version=1.0", + }, ] # This is a database of matching patterns. Most of these discovered by hand, # looking at OA journal content that failed to craw/ingest. -PDF_FULLTEXT_PATTERNS: List[dict] = [ +PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ { "selector": "head meta[name='citation_pdf_url']", "attr": "content", @@ -272,7 +303,7 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [ "example_page": "https://journals.lww.com/otainternational/Fulltext/2019/03011/Trauma_systems_in_North_America.2.aspx", }, { - "selector": "head meta[propery='citation_pdf_url']", + "selector": "head meta[property='citation_pdf_url']", "attr": "content", "technique": "citation_pdf_url", # eg, researchgate @@ -300,10 +331,10 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [ "example_page": "https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379", }, { - "in_doc_url": "/article/view/", + "in_doc_url": "/view/", "selector": "a#pdfDownloadLink", "attr": "href", - "technique": "pdfDownloadLink link", + "technique": "OJS pdfDownloadLink link", "example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336", }, { @@ -375,16 +406,371 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [ "technique": "PDF URL link", "example_page": "http://www.bezmialemscience.org/archives/archive-detail/article-preview/editorial/20439", }, + { + "in_doc_url": "degruyter.com/document/", + "in_fulltext_url": "/pdf", + "selector": "a.downloadPdf", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://www.degruyter.com/document/doi/10.1515/zaw-2021-0001/html", + }, + { + "in_doc_url": "repositorio.unicamp.br/handle/", + "in_fulltext_url": "/bitstream/", + "selector": "table.panel-body a[target='_blank']", + "attr": "href", + "technique": "PDF URL link", + "example_page": "http://www.repositorio.unicamp.br/handle/REPOSIP/287750", + }, + { + "in_doc_url": "dlc.library.columbia.edu/durst/", + "selector": "dd.blacklight-lib_non_item_in_context_url_ssm a[href]", + "attr": "href", + "technique": "Access URL link", + "example_page": "https://dlc.library.columbia.edu/durst/cul:18931zcrk9", + }, + { + "in_doc_url": "fldeploc.dep.state.fl.us/geodb_query/fgs_doi", + "in_fulltext_url": "pdf", + "selector": "p a[href]", + "attr": "href", + "technique": "PDF URL link", + "example_page": "http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29", + }, + { + "in_doc_url": "preprints.jmir.org/preprint/", + "selector": "a.pdf-download-button", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://preprints.jmir.org/preprint/22556", + }, + { + "in_doc_url": "bloomsburycollections.com/", + "in_fulltext_url": "pdf", + "selector": "li.download-item a[href]", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries", + }, + { + "in_doc_url": "emerald.com/insight/content/", + "in_fulltext_url": "pdf", + "selector": "a.intent_pdf_link", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html", + }, + { + "in_doc_url": "ingentaconnect.com/content/", + "in_fulltext_url": "pdf", + "selector": "a.pdf[data-popup]", + "attr": "data-popup", + "technique": "PDF URL link", + "example_page": "https://www.ingentaconnect.com/content/ista/sst/2021/00000049/00000001/art00007", + }, + { + "in_doc_url": "library.wur.nl/", + "in_fulltext_url": "pdf", + "selector": "a.wl_full_text_restricted", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://library.wur.nl/WebQuery/wurpubs/529922", + }, + { + "in_doc_url": "/dlibra/", + "in_fulltext_url": "pdf", + "selector": "iframe#js-main-frame", + "attr": "src", + "technique": "PDF iframe (dlibra)", + "example_page": "https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031", + }, + { + "in_doc_url": "/handle/", + "in_fulltext_url": "pdf", + "selector": "table.misc table.inner tr.b a", + "attr": "href", + "technique": "PDF URL link (DSpace, first file)", + "example_page": "https://orbi.uliege.be/handle/2268/174200", + }, + { + "in_doc_url": "/publications/", + "in_fulltext_url": "pdf", + "selector": ".publication-sidebar li.open-access a.document-link", + "attr": "href", + "technique": "PDF URL link (Pure repo, OA link)", + "example_page": "https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance", + }, + { + "in_doc_url": "//hal", + "selector": ".widget-openaccess .widget-content a", + "attr": "href", + "technique": "Fulltext OA URL (HAL)", + "example_page": "https://hal.archives-ouvertes.fr/hal-00744951", + }, + { + "in_doc_url": "/record/", + "in_fulltext_url": "pdf", + "selector": "#detailedrecordminipanelfile a", + "attr": "href", + "technique": "PDF URL link (Invenio)", + "example_page": "https://bib-pubdb1.desy.de/record/416556", + }, + { + "in_doc_url": "/available/", + "in_fulltext_url": "pdf", + "selector": "table.file-table a", + "attr": "href", + "technique": "PDF URL link", + "example_page": "https://etd.adm.unipi.it/theses/available/etd-05302014-183910/", + }, + { + "in_doc_url": "/islandora/", + "in_fulltext_url": "pdf", + "selector": "a.islandora-pdf-link", + "attr": "href", + "technique": "PDF URL link (Islandora)", + "example_page": "http://fau.digital.flvc.org/islandora/object/fau%3A9804", + }, + { + "in_doc_url": "/receive/", + "in_fulltext_url": "pdf", + "selector": ".mir-preview noscript a", + "attr": "href", + "technique": "PDF iframe via noscript (MyCoRe)", + "example_page": "https://www.db-thueringen.de/receive/dbt_mods_00005191", + }, + { + "in_doc_url": "/registro.do", + "in_fulltext_url": "imagenes", + "selector": ".resumen_bib a[data-analytics=media]", + "attr": "href", + "technique": "Media link (DIGIBIS)", + "example_page": "https://bivaldi.gva.es/es/consulta/registro.do?id=11740", + }, + { + "in_doc_url": "/view", + "in_fulltext_url": "/at_download/", + "selector": ".documentContent #content a", + "attr": "href", + "technique": "Media link (Plone)", + "example_page": "http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view", + }, + { + "in_doc_url": "isca-speech.org/", + "in_fulltext_url": "pdf", + "selector": ".w3-container a", + "attr": "href", + "technique": "PDF URL link (isca-speech.org)", + "example_page": "https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html", + }, + { + "in_doc_url": "://repository.dri.ie/", + "in_fulltext_url": "/download", + "selector": "#dri_download_assets > div > a", + "attr": "href", + "technique": "Download link (repository.dri.ie)", + "example_page": "https://repository.dri.ie/catalog/qf8621102", + }, + { + "in_doc_url": "frontiersin.org/", + "in_fulltext_url": "pdf", + "selector": "a.download-files-pdf", + "attr": "href", + "technique": "PDF Download link (frontiersin.org)", + "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full", + }, + { + "in_doc_url": "cureus.com/", + "in_fulltext_url": "pdf", + "selector": ".small-medium-pdf a.pdf-download-button", + "attr": "href", + "technique": "PDF Download link (cureus.com)", + "example_page": "https://www.cureus.com/articles/69542-tramadol-induced-jerks", + }, + { + "in_doc_url": "e-manuscripta.ch/", + "in_fulltext_url": "pdf", + "selector": "#titleinfoPdfDownload a.resourceLink", + "attr": "href", + "technique": "PDF Download link (e-manuscripta.ch)", + "example_page": "https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176", + }, + { + "in_doc_url": "journals.uchicago.edu", + "in_fulltext_url": "pdf", + "selector": "nav.article__navbar a.ctrl--pdf", + "attr": "href", + "technique": "PDF Download link (journals.uchicago.edu)", + "example_page": "https://www.journals.uchicago.edu/doi/10.14318/hau1.1.008", + }, + { + "in_doc_url": "integrityresjournals.org", + "in_fulltext_url": "/article-full-text-pdf/", + "selector": "a[target='_blank'].btn-danger", + "attr": "href", + "technique": "PDF Download link (integrityresjournals.org)", + "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1", + }, + { + "in_doc_url": "/view/", + "in_fulltext_url": "/download/", + "selector": "body.pkp_page_article a.download", + "attr": "href", + "technique": "OJS PDF Embed", + "example_page": "https://periodicals.karazin.ua/language_teaching/article/view/12543/11957", + }, + { + "in_doc_url": "/article/view/", + "in_fulltext_url": "/article/", + "selector": "a.pdf", + "attr": "href", + "technique": "OJS PDF link", + }, + { + "in_doc_url": "scitemed.com/article/", + "in_fulltext_url": ".pdf", + "selector": "li.tab_pdf_btn a", + "attr": "href", + "technique": "PDF link (scitemed.com)", + }, + { + "in_doc_url": "://doaj.org/article/", + "selector": "section.col-md-8 a[target='_blank'].button--primary", + "attr": "href", + "technique": "doaj.org access link", + }, + { + "in_doc_url": "/jvi.aspx", + "in_fulltext_url": "download_fulltext", + "selector": "div.siteMainWrapper div.siteArticleShare a[target='_blank'].list-group-item", + "attr": "href", + "technique": "erciyesmedj.com publication system PDF download link", + }, + { + "selector": "body embed[alt='pdf']", + "attr": "src", + "technique": "embed PDF", + "example_pdf": "https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913", + }, + { + "in_fulltext_url": "viewPDFInterstitial", + "in_doc_url": "/view/", + "selector": "frameset frame", + "attr": "src", + "technique": "PDF iframe (viewPDFInterstitial)", + "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873", + }, + { + # note this one has a special handler + "in_doc_url": "viewPDFInterstitial", + "in_fulltext_url": "://", + "selector": "head meta[http-equiv='refresh']", + "attr": "content", + "technique": "HTML meta refresh (viewPDFInterstitial)", + "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873", + }, + { + "in_doc_url": "dlib.si/details/", + "in_fulltext_url": "PDF", + "selector": "body #FilesBox a", + "attr": "href", + "technique": "dlib.si download links", + "example_page": "https://www.dlib.si/details/URN:NBN:SI:DOC-WR9GTSCJ", + }, + { + "in_doc_url": "filclass.ru", + "in_fulltext_url": "pdf", + "selector": "main .pdf-article a.pdficon", + "attr": "href", + "technique": "filclass.ru PDF link", + "example_page": "https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism", + }, + { + "in_doc_url": "cdnsciencepub.com", + "in_fulltext_url": "pdf", + "selector": "article .info-panel a.btn--pdf", + "attr": "href", + "technique": "cdnsciencepub.com PDF link", + "example_page": "https://cdnsciencepub.com/doi/10.1139/AS-2022-0011", + }, + { + "in_doc_url": "grrjournal.com", + "in_fulltext_url": "pdf", + "selector": ".ereaders-main-section a[download]", + "attr": "href", + "technique": "grrjournal.com PDF link", + "example_page": "https://www.grrjournal.com/article/analysis-of-audiences-uses-and-gratifications-in-the-selected-pakistani-urdu-films", + }, + { + "in_doc_url": "/view/", + "in_fulltext_url": "pdf", + "selector": "#articleFullText a.remote_pdf", + "attr": "href", + "technique": "OJS remote_pdf link", + "example_page": "https://www.mediterranea-comunicacion.org/article/view/22240", + }, + { + "in_doc_url": "worldscientific.com/doi/abs/", + "in_fulltext_url": "/reader/", + "selector": "article.container .single__download a", + "attr": "href", + "technique": "worldscientific landing pages", + "example_page": "https://www.worldscientific.com/doi/abs/10.1142/S0116110521500098", + }, + { + "in_doc_url": "worldscientific.com/doi/", + "in_fulltext_url": "/pdf/", + "selector": "noscript a[target='_blank']", + "attr": "href", + "technique": "worldscientific reader", + "example_page": "https://www.worldscientific.com/doi/epdf/10.1142/S0116110521500098", + }, + { + "in_fulltext_url": "pdf", + "selector": ".container .view-content .download-article a", + "attr": "href", + "technique": "generic download article button", + "example_page": "https://science.lpnu.ua/mmc/all-volumes-and-issues/volume-9-number-1-2022/pursuit-differential-game-many-pursuers-and-one", + }, + { + "in_fulltext_url": "pdf", + "selector": "body a.download-pdf", + "attr": "href", + "technique": "generic download article button", + "example_page": "https://plit-periodical.com.ua/arhiv/struktura-ta-vlastyvosti-materialu-zrazkiv-vyroshchenyh-metodom-selektyvnogo-lazernogo", + }, + { + "in_doc_url": "/view/", + "in_fulltext_url": "/view/", + "selector": "body .entry_details a.pdf", + "attr": "href", + "technique": "generic OJS/preprints", + "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/version/5022", + }, + { + "in_doc_url": "/view/", + "in_fulltext_url": "/download/", + "selector": "body header a.download", + "attr": "href", + "technique": "generic OJS/preprints PDF Embed", + "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/9327", + }, ] -FULLTEXT_URL_PATTERNS_SKIP = [ +FULLTEXT_URL_PATTERNS_SKIP: List[str] = [ # wiley has a weird almost-blank page we don't want to loop on - "://onlinelibrary.wiley.com/doi/pdf/" - "://doi.org/" - "://dx.doi.org/" + "://onlinelibrary.wiley.com/doi/pdf/", + "://doi.org/", + "://dx.doi.org/", + "{'embed': '", ] -RELEASE_TYPE_MAP = { +FULLTEXT_URL_PREFIX_SKIP: List[str] = [ + "javascript:", + "about:", +] + +RELEASE_TYPE_MAP: Dict[str, str] = { "research article": "article-journal", "text.serial.journal": "article-journal", } @@ -426,14 +812,15 @@ class BiblioMetadata(pydantic.BaseModel): pdf_fulltext_url: Optional[str] html_fulltext_url: Optional[str] xml_fulltext_url: Optional[str] + component_url: Optional[str] class Config: - json_encoders = { - datetime.date: lambda dt: dt.isoformat() - } + json_encoders = {datetime.date: lambda dt: dt.isoformat()} -def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]: +def html_extract_fulltext_url( + doc_url: str, doc: HTMLParser, patterns: List[dict] +) -> Optional[Tuple[str, str]]: """ Tries to quickly extract fulltext URLs using a set of patterns. This function is intendend to be generic across various extraction techniques. @@ -442,49 +829,74 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict """ self_doc_url: Optional[Tuple[str, str]] = None for pattern in patterns: - if not 'selector' in pattern: + if "selector" not in pattern: continue - if 'in_doc_url' in pattern: - if not pattern['in_doc_url'] in doc_url: + if "in_doc_url" in pattern: + if pattern["in_doc_url"] not in doc_url: continue - elem = doc.css_first(pattern['selector']) + elem = doc.css_first(pattern["selector"]) if not elem: continue - if 'attr' in pattern: - val = elem.attrs.get(pattern['attr']) - if not val: + val = None + if "attr" in pattern: + val = elem.attrs.get(pattern["attr"]) + # handle HTML redirect + if val and pattern["attr"] == "content" and "URL=" in val: + val = val.split("URL=")[1] + elif pattern.get("use_body"): + val = elem.text() + if "://" not in val: continue - val = urllib.parse.urljoin(doc_url, val) - assert val - if 'in_fulltext_url' in pattern: - if not pattern['in_fulltext_url'] in val: - continue - for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP: - if skip_pattern in val.lower(): - continue - if url_fuzzy_equal(doc_url, val): - # don't link to self, unless no other options - self_doc_url = (val, pattern.get('technique', 'unknown')) + if not val: + continue + val = urllib.parse.urljoin(doc_url, val) + assert val + if "in_fulltext_url" in pattern: + if pattern["in_fulltext_url"] not in val: continue - return (val, pattern.get('technique', 'unknown')) + skip_matched = False + for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP: + if skip_pattern in val.lower(): + skip_matched = True + break + if skip_matched: + continue + for skip_pattern in FULLTEXT_URL_PREFIX_SKIP: + if val.lower().startswith(skip_pattern): + skip_matched = True + break + if skip_matched: + continue + if url_fuzzy_equal(doc_url, val): + # don't link to self, unless no other options + self_doc_url = (val, pattern.get("technique", "unknown")) + continue + + # quirks modes / hacks + if "drops.dagstuhl.de" in doc_url and val.endswith(".pdf/"): + val = val[:-1] + + return (val, pattern.get("technique", "unknown")) if self_doc_url: - print(f" WARN: returning fulltext URL pointing to self", file=sys.stderr) + print(" WARN: returning fulltext URL pointing to self", file=sys.stderr) return self_doc_url return None + def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]: meta: Any = dict() head = doc.css_first("head") if not head: + print(f"WARN: empty <head>? {doc_url}", file=sys.stderr) return None for field, patterns in HEAD_META_PATTERNS.items(): for pattern in patterns: val = head.css_first(pattern) - #print((field, pattern, val)) - if val and 'content' in val.attrs and val.attrs['content']: - meta[field] = val.attrs['content'] + # print((field, pattern, val)) + if val and "content" in val.attrs and val.attrs["content"]: + meta[field] = val.attrs["content"] break for field, patterns in HEAD_META_LIST_PATTERNS.items(): @@ -492,53 +904,57 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat val_list = head.css(pattern) if val_list: for val in val_list: - if 'content' in val.attrs and val.attrs['content']: - if not field in meta: + if "content" in val.attrs and val.attrs["content"]: + if field not in meta: meta[field] = [] - meta[field].append(val.attrs['content']) + meta[field].append(val.attrs["content"]) break # (some) fulltext extractions pdf_fulltext_url = html_extract_fulltext_url(doc_url, doc, PDF_FULLTEXT_PATTERNS) if pdf_fulltext_url: - meta['pdf_fulltext_url'] = pdf_fulltext_url[0] + meta["pdf_fulltext_url"] = pdf_fulltext_url[0] xml_fulltext_url = html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS) if xml_fulltext_url: - meta['xml_fulltext_url'] = xml_fulltext_url[0] + meta["xml_fulltext_url"] = xml_fulltext_url[0] html_fulltext_url = html_extract_fulltext_url(doc_url, doc, HTML_FULLTEXT_PATTERNS) if html_fulltext_url: - meta['html_fulltext_url'] = html_fulltext_url[0] + meta["html_fulltext_url"] = html_fulltext_url[0] + component_url = html_extract_fulltext_url(doc_url, doc, COMPONENT_FULLTEXT_PATTERNS) + if component_url: + meta["component_url"] = component_url[0] # TODO: replace with clean_doi() et al - if meta.get('doi') and meta.get('doi').startswith('doi:'): - meta['doi'] = meta['doi'][4:] + if meta.get("doi") and meta.get("doi").startswith("doi:"): + meta["doi"] = meta["doi"][4:] - raw_identifiers = meta.pop('raw_identifiers', []) + raw_identifiers = meta.pop("raw_identifiers", []) for ident in raw_identifiers: - if ident.startswith('doi:10.'): - if not 'doi' in meta: - meta['doi'] = ident.replace('doi:', '') - elif ident.startswith('10.') and '/' in ident: - if not 'doi' in meta: - meta['doi'] = ident - elif ident.startswith('isbn:'): - if not 'isbn' in meta: - meta['isbn'] = ident.replace('isbn:', '') - - raw_date = meta.pop('raw_date', None) + if ident.startswith("doi:10."): + if "doi" not in meta: + meta["doi"] = ident.replace("doi:", "") + elif ident.startswith("10.") and "/" in ident: + if "doi" not in meta: + meta["doi"] = ident + elif ident.startswith("isbn:"): + if "isbn" not in meta: + meta["isbn"] = ident.replace("isbn:", "") + + raw_date = meta.pop("raw_date", None) if raw_date: parsed = dateparser.parse(raw_date) if parsed: - meta['release_date'] = parsed.date() + meta["release_date"] = parsed.date() - raw_release_type = meta.pop('raw_release_type', None) + raw_release_type = meta.pop("raw_release_type", None) if raw_release_type: release_type = RELEASE_TYPE_MAP.get(raw_release_type.lower().strip()) if release_type: - meta['release_type'] = release_type + meta["release_type"] = release_type return BiblioMetadata(**meta) + def load_adblock_rules() -> braveblock.Adblocker: """ TODO: consider blocking very generic assets: @@ -561,46 +977,67 @@ def load_adblock_rules() -> braveblock.Adblocker: "||pbs.twimg.com^", "||badge.dimensions.ai^", "||recaptcha.net^", - + "||tag.imagino.com^", + "||consent.cookiebot.com^", + "||recaptcha.net^", # not sure about these CC badges (usually via a redirect) - #"||licensebuttons.net^", - #"||i.creativecommons.org^", - + # "||licensebuttons.net^", + # "||i.creativecommons.org^", # Should we skip jquery, or other generic javascript CDNs? - #"||code.jquery.com^", - #"||ajax.googleapis.com^", - #"||cdnjs.cloudflare.com^", - + # "||code.jquery.com^", + # "||ajax.googleapis.com^", + # "||cdnjs.cloudflare.com^", # badges, "share" buttons, tracking, etc "apis.google.com/js/plusone", "www.google.com/recaptcha/", "js/_getUACode.js" - # PLOS images "/resource/img/icon.*.16.png^", + # CAIRN broken tracking tag + "cairn-int.info//about.php?cairn_guest=", ], ) -def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name: str) -> list: +def _extract_generic( + doc: HTMLParser, selector: str, attrs: List[str], type_name: str +) -> List[Dict[str, str]]: resources = [] for node in doc.css(selector): for attr in attrs: - if not attr in node.attrs: + if attr not in node.attrs: continue url = node.attrs.get(attr) + # special-case a couple meta URI prefixes which don't match with adblock rules + skip = False + for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:", "javascript:"]: + if url and url.startswith(prefix): + skip = True + break + if url and "/" not in url and "." not in url and " " in url: + # eg: "Ce fichier n'existe pas" + skip = True + if skip: + continue + if url and url.startswith("https://https://"): + url = url[8:] + elif url and url.startswith("http://http://"): + url = url[7:] if url: - resources.append(dict(url=url, type=type_name)) + # print(url, file=sys.stderr) + resources.append(dict(url=url.strip(), type=type_name)) return resources -def html_extract_resources(doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker) -> list: +def html_extract_resources( + doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker +) -> List[Dict[str, str]]: """ This function tries to find all the important resources in a page. The presumption is that the HTML document is article fulltext, and we want the - list of all resoures (by URL) necessary to replay the page. + list of all resources (by URL) necessary to replay the page. The returned resource URLs each have a type (script, img, css, etc), and should be fully-qualified URLs (not relative). @@ -624,13 +1061,17 @@ def html_extract_resources(doc_url: str, doc: HTMLParser, adblock: braveblock.Ad # ensure URLs are absolute for r in resources: - r['url'] = urllib.parse.urljoin(doc_url, r['url']) + r["url"] = urllib.parse.urljoin(doc_url, r["url"]) # filter using adblocker - resources = [r for r in resources if adblock.check_network_urls(r['url'], source_url=doc_url, request_type=r['type']) == False] + resources = [ + r + for r in resources + if adblock.check_network_urls(r["url"], source_url=doc_url, request_type=r["type"]) + is False + ] # remove duplicates resources = [dict(t) for t in {tuple(d.items()) for d in resources}] return resources - |