diff options
Diffstat (limited to 'python/sandcrawler/html_metadata.py')
-rw-r--r-- | python/sandcrawler/html_metadata.py | 210 |
1 files changed, 202 insertions, 8 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py index c46788e..1e2d197 100644 --- a/python/sandcrawler/html_metadata.py +++ b/python/sandcrawler/html_metadata.py @@ -207,7 +207,7 @@ XML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "technique": "SciElo XML link", }, { - "in_doc_url": "/article/view/", + "in_doc_url": "/view/", "in_fulltext_url": "viewXML", "selector": "a[class='obj_galley_link']", "attr": "href", @@ -255,6 +255,12 @@ HTML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "attr": "href", "technique": "dovepress fulltext link", }, + { + "in_doc_url": "://doaj.org/article/", + "selector": "section.col-md-8 a[target='_blank'].button--primary", + "attr": "href", + "technique": "doaj.org access link", + }, ] COMPONENT_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ @@ -325,10 +331,10 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "example_page": "https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379", }, { - "in_doc_url": "/article/view/", + "in_doc_url": "/view/", "selector": "a#pdfDownloadLink", "attr": "href", - "technique": "pdfDownloadLink link", + "technique": "OJS pdfDownloadLink link", "example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336", }, { @@ -597,13 +603,171 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [ "technique": "PDF Download link (journals.uchicago.edu)", "example_page": "https://www.journals.uchicago.edu/doi/10.14318/hau1.1.008", }, + { + "in_doc_url": "integrityresjournals.org", + "in_fulltext_url": "/article-full-text-pdf/", + "selector": "a[target='_blank'].btn-danger", + "attr": "href", + "technique": "PDF Download link (integrityresjournals.org)", + "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1", + }, + { + "in_doc_url": "/view/", + "in_fulltext_url": "/download/", + "selector": "body.pkp_page_article a.download", + "attr": "href", + "technique": "OJS PDF Embed", + "example_page": "https://periodicals.karazin.ua/language_teaching/article/view/12543/11957", + }, + { + "in_doc_url": "/article/view/", + "in_fulltext_url": "/article/", + "selector": "a.pdf", + "attr": "href", + "technique": "OJS PDF link", + }, + { + "in_doc_url": "scitemed.com/article/", + "in_fulltext_url": ".pdf", + "selector": "li.tab_pdf_btn a", + "attr": "href", + "technique": "PDF link (scitemed.com)", + }, + { + "in_doc_url": "://doaj.org/article/", + "selector": "section.col-md-8 a[target='_blank'].button--primary", + "attr": "href", + "technique": "doaj.org access link", + }, + { + "in_doc_url": "/jvi.aspx", + "in_fulltext_url": "download_fulltext", + "selector": "div.siteMainWrapper div.siteArticleShare a[target='_blank'].list-group-item", + "attr": "href", + "technique": "erciyesmedj.com publication system PDF download link", + }, + { + "selector": "body embed[alt='pdf']", + "attr": "src", + "technique": "embed PDF", + "example_pdf": "https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913", + }, + { + "in_fulltext_url": "viewPDFInterstitial", + "in_doc_url": "/view/", + "selector": "frameset frame", + "attr": "src", + "technique": "PDF iframe (viewPDFInterstitial)", + "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873", + }, + { + # note this one has a special handler + "in_doc_url": "viewPDFInterstitial", + "in_fulltext_url": "://", + "selector": "head meta[http-equiv='refresh']", + "attr": "content", + "technique": "HTML meta refresh (viewPDFInterstitial)", + "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873", + }, + { + "in_doc_url": "dlib.si/details/", + "in_fulltext_url": "PDF", + "selector": "body #FilesBox a", + "attr": "href", + "technique": "dlib.si download links", + "example_page": "https://www.dlib.si/details/URN:NBN:SI:DOC-WR9GTSCJ", + }, + { + "in_doc_url": "filclass.ru", + "in_fulltext_url": "pdf", + "selector": "main .pdf-article a.pdficon", + "attr": "href", + "technique": "filclass.ru PDF link", + "example_page": "https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism", + }, + { + "in_doc_url": "cdnsciencepub.com", + "in_fulltext_url": "pdf", + "selector": "article .info-panel a.btn--pdf", + "attr": "href", + "technique": "cdnsciencepub.com PDF link", + "example_page": "https://cdnsciencepub.com/doi/10.1139/AS-2022-0011", + }, + { + "in_doc_url": "grrjournal.com", + "in_fulltext_url": "pdf", + "selector": ".ereaders-main-section a[download]", + "attr": "href", + "technique": "grrjournal.com PDF link", + "example_page": "https://www.grrjournal.com/article/analysis-of-audiences-uses-and-gratifications-in-the-selected-pakistani-urdu-films", + }, + { + "in_doc_url": "/view/", + "in_fulltext_url": "pdf", + "selector": "#articleFullText a.remote_pdf", + "attr": "href", + "technique": "OJS remote_pdf link", + "example_page": "https://www.mediterranea-comunicacion.org/article/view/22240", + }, + { + "in_doc_url": "worldscientific.com/doi/abs/", + "in_fulltext_url": "/reader/", + "selector": "article.container .single__download a", + "attr": "href", + "technique": "worldscientific landing pages", + "example_page": "https://www.worldscientific.com/doi/abs/10.1142/S0116110521500098", + }, + { + "in_doc_url": "worldscientific.com/doi/", + "in_fulltext_url": "/pdf/", + "selector": "noscript a[target='_blank']", + "attr": "href", + "technique": "worldscientific reader", + "example_page": "https://www.worldscientific.com/doi/epdf/10.1142/S0116110521500098", + }, + { + "in_fulltext_url": "pdf", + "selector": ".container .view-content .download-article a", + "attr": "href", + "technique": "generic download article button", + "example_page": "https://science.lpnu.ua/mmc/all-volumes-and-issues/volume-9-number-1-2022/pursuit-differential-game-many-pursuers-and-one", + }, + { + "in_fulltext_url": "pdf", + "selector": "body a.download-pdf", + "attr": "href", + "technique": "generic download article button", + "example_page": "https://plit-periodical.com.ua/arhiv/struktura-ta-vlastyvosti-materialu-zrazkiv-vyroshchenyh-metodom-selektyvnogo-lazernogo", + }, + { + "in_doc_url": "/view/", + "in_fulltext_url": "/view/", + "selector": "body .entry_details a.pdf", + "attr": "href", + "technique": "generic OJS/preprints", + "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/version/5022", + }, + { + "in_doc_url": "/view/", + "in_fulltext_url": "/download/", + "selector": "body header a.download", + "attr": "href", + "technique": "generic OJS/preprints PDF Embed", + "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/9327", + }, ] FULLTEXT_URL_PATTERNS_SKIP: List[str] = [ # wiley has a weird almost-blank page we don't want to loop on - "://onlinelibrary.wiley.com/doi/pdf/" - "://doi.org/" - "://dx.doi.org/" + "://onlinelibrary.wiley.com/doi/pdf/", + "://doi.org/", + "://dx.doi.org/", + "{'embed': '", +] + +FULLTEXT_URL_PREFIX_SKIP: List[str] = [ + "javascript:", + "about:", ] RELEASE_TYPE_MAP: Dict[str, str] = { @@ -676,6 +840,9 @@ def html_extract_fulltext_url( val = None if "attr" in pattern: val = elem.attrs.get(pattern["attr"]) + # handle HTML redirect + if val and pattern["attr"] == "content" and "URL=" in val: + val = val.split("URL=")[1] elif pattern.get("use_body"): val = elem.text() if "://" not in val: @@ -687,13 +854,28 @@ def html_extract_fulltext_url( if "in_fulltext_url" in pattern: if pattern["in_fulltext_url"] not in val: continue + skip_matched = False for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP: if skip_pattern in val.lower(): - continue + skip_matched = True + break + if skip_matched: + continue + for skip_pattern in FULLTEXT_URL_PREFIX_SKIP: + if val.lower().startswith(skip_pattern): + skip_matched = True + break + if skip_matched: + continue if url_fuzzy_equal(doc_url, val): # don't link to self, unless no other options self_doc_url = (val, pattern.get("technique", "unknown")) continue + + # quirks modes / hacks + if "drops.dagstuhl.de" in doc_url and val.endswith(".pdf/"): + val = val[:-1] + return (val, pattern.get("technique", "unknown")) if self_doc_url: print(" WARN: returning fulltext URL pointing to self", file=sys.stderr) @@ -795,6 +977,9 @@ def load_adblock_rules() -> braveblock.Adblocker: "||pbs.twimg.com^", "||badge.dimensions.ai^", "||recaptcha.net^", + "||tag.imagino.com^", + "||consent.cookiebot.com^", + "||recaptcha.net^", # not sure about these CC badges (usually via a redirect) # "||licensebuttons.net^", # "||i.creativecommons.org^", @@ -808,6 +993,8 @@ def load_adblock_rules() -> braveblock.Adblocker: "js/_getUACode.js" # PLOS images "/resource/img/icon.*.16.png^", + # CAIRN broken tracking tag + "cairn-int.info//about.php?cairn_guest=", ], ) @@ -824,12 +1011,19 @@ def _extract_generic( url = node.attrs.get(attr) # special-case a couple meta URI prefixes which don't match with adblock rules skip = False - for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:"]: + for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:", "javascript:"]: if url and url.startswith(prefix): skip = True break + if url and "/" not in url and "." not in url and " " in url: + # eg: "Ce fichier n'existe pas" + skip = True if skip: continue + if url and url.startswith("https://https://"): + url = url[8:] + elif url and url.startswith("http://http://"): + url = url[7:] if url: # print(url, file=sys.stderr) resources.append(dict(url=url.strip(), type=type_name)) |