1 files changed, 528 insertions, 87 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index f9f48a6..1e2d197 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -1,17 +1,15 @@
-
-import sys
 import datetime
-from typing import List, Optional, Any, Tuple, Dict
+import sys
 import urllib.parse
+from typing import Any, Dict, List, Optional, Tuple
 
+import braveblock
 import dateparser
-from selectolax.parser import HTMLParser
 import pydantic
-import braveblock
+from selectolax.parser import HTMLParser
 
 from sandcrawler.misc import url_fuzzy_equal
 
-
 # this is a map of metadata keys to CSS selectors
 # sources for this list include:
 #  - google scholar crawling notes (https://scholar.google.com/intl/ja/scholar/inclusion.html#indexing)
@@ -22,7 +20,7 @@ from sandcrawler.misc import url_fuzzy_equal
 # order of these are mostly by preference/quality (best option first), though
 # also/sometimes re-ordered for lookup efficiency (lookup stops after first
 # match)
-HEAD_META_PATTERNS: Any = {
+HEAD_META_PATTERNS: Dict[str, List[str]] = {
     "title": [
         "meta[name='citation_title']",
         "meta[name='eprints.title']",
@@ -159,7 +157,7 @@ HEAD_META_PATTERNS: Any = {
     ],
 }
 
-HEAD_META_LIST_PATTERNS: Any = {
+HEAD_META_LIST_PATTERNS: Dict[str, List[str]] = {
     "contrib_names": [
         "meta[name='citation_author']",
         "meta[name='bepress_citation_author']",
@@ -180,7 +178,7 @@ HEAD_META_LIST_PATTERNS: Any = {
     ],
 }
 
-XML_FULLTEXT_PATTERNS: List[dict] = [
+XML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
     {
         "selector": "meta[name='citation_xml_url']",
         "attr": "content",
@@ -209,7 +207,7 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
         "technique": "SciElo XML link",
     },
     {
-        "in_doc_url": "/article/view/",
+        "in_doc_url": "/view/",
         "in_fulltext_url": "viewXML",
         "selector": "a[class='obj_galley_link']",
         "attr": "href",
@@ -222,9 +220,17 @@ XML_FULLTEXT_PATTERNS: List[dict] = [
         "technique": "ARPHA XML link",
         "example_page": "https://zookeys.pensoft.net/article/26391",
     },
+    {
+        "in_doc_url": "frontiersin.org/",
+        "in_fulltext_url": "xml",
+        "selector": "a.download-files-nlm",
+        "attr": "href",
+        "technique": "XML (NLM) download link (frontiersin.org)",
+        "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full",
+    },
 ]
 
-HTML_FULLTEXT_PATTERNS: List[dict] = [
+HTML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
     {
         "selector": "meta[name='citation_fulltext_html_url']",
         "attr": "content",
@@ -249,11 +255,36 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [
         "attr": "href",
         "technique": "dovepress fulltext link",
     },
+    {
+        "in_doc_url": "://doaj.org/article/",
+        "selector": "section.col-md-8 a[target='_blank'].button--primary",
+        "attr": "href",
+        "technique": "doaj.org access link",
+    },
+]
+
+COMPONENT_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
+    {
+        "in_doc_url": "pensoft.net/article/",  # also /element/
+        "in_fulltext_url": "/download/fig/",
+        "selector": ".Main-Content .figure a.P-Article-Preview-Picture-Download-Small",
+        "attr": "href",
+        "technique": "Active figure download link (zookeys)",
+        "example_page": "https://zookeys.pensoft.net/article/38576/element/2/153/",
+    },
+    {
+        "in_doc_url": "/file.xhtml?persistentId",
+        "in_fulltext_url": "/access/datafile/",
+        "selector": "div.form-group code",
+        "use_body": "true",
+        "technique": "Dataverse 'download URL'",
+        "example_page": "https://data.lipi.go.id/file.xhtml?persistentId=hdl:20.500.12690/RIN/IDDOAH/BTNH25&version=1.0",
+    },
 ]
 
 # This is a database of matching patterns. Most of these discovered by hand,
-# looking at OA journal content that failed to craw/ingest.
+# looking at OA journal content that failed to crawl/ingest.
-PDF_FULLTEXT_PATTERNS: List[dict] = [
+PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
     {
         "selector": "head meta[name='citation_pdf_url']",
         "attr": "content",
@@ -272,7 +303,7 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
         "example_page": "https://journals.lww.com/otainternational/Fulltext/2019/03011/Trauma_systems_in_North_America.2.aspx",
     },
     {
-        "selector": "head meta[propery='citation_pdf_url']",
+        "selector": "head meta[property='citation_pdf_url']",
         "attr": "content",
         "technique": "citation_pdf_url",
         # eg, researchgate
@@ -300,10 +331,10 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
         "example_page": "https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379",
     },
     {
-        "in_doc_url": "/article/view/",
+        "in_doc_url": "/view/",
         "selector": "a#pdfDownloadLink",
         "attr": "href",
-        "technique": "pdfDownloadLink link",
+        "technique": "OJS pdfDownloadLink link",
         "example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336",
     },
     {
@@ -375,16 +406,371 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
         "technique": "PDF URL link",
         "example_page": "http://www.bezmialemscience.org/archives/archive-detail/article-preview/editorial/20439",
     },
+    {
+        "in_doc_url": "degruyter.com/document/",
+        "in_fulltext_url": "/pdf",
+        "selector": "a.downloadPdf",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "https://www.degruyter.com/document/doi/10.1515/zaw-2021-0001/html",
+    },
+    {
+        "in_doc_url": "repositorio.unicamp.br/handle/",
+        "in_fulltext_url": "/bitstream/",
+        "selector": "table.panel-body a[target='_blank']",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "http://www.repositorio.unicamp.br/handle/REPOSIP/287750",
+    },
+    {
+        "in_doc_url": "dlc.library.columbia.edu/durst/",
+        "selector": "dd.blacklight-lib_non_item_in_context_url_ssm a[href]",
+        "attr": "href",
+        "technique": "Access URL link",
+        "example_page": "https://dlc.library.columbia.edu/durst/cul:18931zcrk9",
+    },
+    {
+        "in_doc_url": "fldeploc.dep.state.fl.us/geodb_query/fgs_doi",
+        "in_fulltext_url": "pdf",
+        "selector": "p a[href]",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29",
+    },
+    {
+        "in_doc_url": "preprints.jmir.org/preprint/",
+        "selector": "a.pdf-download-button",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "https://preprints.jmir.org/preprint/22556",
+    },
+    {
+        "in_doc_url": "bloomsburycollections.com/",
+        "in_fulltext_url": "pdf",
+        "selector": "li.download-item a[href]",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries",
+    },
+    {
+        "in_doc_url": "emerald.com/insight/content/",
+        "in_fulltext_url": "pdf",
+        "selector": "a.intent_pdf_link",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html",
+    },
+    {
+        "in_doc_url": "ingentaconnect.com/content/",
+        "in_fulltext_url": "pdf",
+        "selector": "a.pdf[data-popup]",
+        "attr": "data-popup",
+        "technique": "PDF URL link",
+        "example_page": "https://www.ingentaconnect.com/content/ista/sst/2021/00000049/00000001/art00007",
+    },
+    {
+        "in_doc_url": "library.wur.nl/",
+        "in_fulltext_url": "pdf",
+        "selector": "a.wl_full_text_restricted",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "https://library.wur.nl/WebQuery/wurpubs/529922",
+    },
+    {
+        "in_doc_url": "/dlibra/",
+        "in_fulltext_url": "pdf",
+        "selector": "iframe#js-main-frame",
+        "attr": "src",
+        "technique": "PDF iframe (dlibra)",
+        "example_page": "https://dbc.wroc.pl/dlibra/docmetadata?showContent=true&id=41031",
+    },
+    {
+        "in_doc_url": "/handle/",
+        "in_fulltext_url": "pdf",
+        "selector": "table.misc table.inner tr.b a",
+        "attr": "href",
+        "technique": "PDF URL link (DSpace, first file)",
+        "example_page": "https://orbi.uliege.be/handle/2268/174200",
+    },
+    {
+        "in_doc_url": "/publications/",
+        "in_fulltext_url": "pdf",
+        "selector": ".publication-sidebar li.open-access a.document-link",
+        "attr": "href",
+        "technique": "PDF URL link (Pure repo, OA link)",
+        "example_page": "https://research.tue.nl/en/publications/lowering-the-threshold-for-computers-in-early-design-some-advance",
+    },
+    {
+        "in_doc_url": "//hal",
+        "selector": ".widget-openaccess .widget-content a",
+        "attr": "href",
+        "technique": "Fulltext OA URL (HAL)",
+        "example_page": "https://hal.archives-ouvertes.fr/hal-00744951",
+    },
+    {
+        "in_doc_url": "/record/",
+        "in_fulltext_url": "pdf",
+        "selector": "#detailedrecordminipanelfile a",
+        "attr": "href",
+        "technique": "PDF URL link (Invenio)",
+        "example_page": "https://bib-pubdb1.desy.de/record/416556",
+    },
+    {
+        "in_doc_url": "/available/",
+        "in_fulltext_url": "pdf",
+        "selector": "table.file-table a",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "https://etd.adm.unipi.it/theses/available/etd-05302014-183910/",
+    },
+    {
+        "in_doc_url": "/islandora/",
+        "in_fulltext_url": "pdf",
+        "selector": "a.islandora-pdf-link",
+        "attr": "href",
+        "technique": "PDF URL link (Islandora)",
+        "example_page": "http://fau.digital.flvc.org/islandora/object/fau%3A9804",
+    },
+    {
+        "in_doc_url": "/receive/",
+        "in_fulltext_url": "pdf",
+        "selector": ".mir-preview noscript a",
+        "attr": "href",
+        "technique": "PDF iframe via noscript (MyCoRe)",
+        "example_page": "https://www.db-thueringen.de/receive/dbt_mods_00005191",
+    },
+    {
+        "in_doc_url": "/registro.do",
+        "in_fulltext_url": "imagenes",
+        "selector": ".resumen_bib a[data-analytics=media]",
+        "attr": "href",
+        "technique": "Media link (DIGIBIS)",
+        "example_page": "https://bivaldi.gva.es/es/consulta/registro.do?id=11740",
+    },
+    {
+        "in_doc_url": "/view",
+        "in_fulltext_url": "/at_download/",
+        "selector": ".documentContent #content a",
+        "attr": "href",
+        "technique": "Media link (Plone)",
+        "example_page": "http://xjornadaslc.fahce.unlp.edu.ar/actas/Ramon_Esteban_Chaparro.pdf/view",
+    },
+    {
+        "in_doc_url": "isca-speech.org/",
+        "in_fulltext_url": "pdf",
+        "selector": ".w3-container a",
+        "attr": "href",
+        "technique": "PDF URL link (isca-speech.org)",
+        "example_page": "https://www.isca-speech.org/archive/interspeech_2006/chitturi06b_interspeech.html",
+    },
+    {
+        "in_doc_url": "://repository.dri.ie/",
+        "in_fulltext_url": "/download",
+        "selector": "#dri_download_assets > div > a",
+        "attr": "href",
+        "technique": "Download link (repository.dri.ie)",
+        "example_page": "https://repository.dri.ie/catalog/qf8621102",
+    },
+    {
+        "in_doc_url": "frontiersin.org/",
+        "in_fulltext_url": "pdf",
+        "selector": "a.download-files-pdf",
+        "attr": "href",
+        "technique": "PDF Download link (frontiersin.org)",
+        "example_page": "https://www.frontiersin.org/articles/10.3389/fnins.2021.722592/full",
+    },
+    {
+        "in_doc_url": "cureus.com/",
+        "in_fulltext_url": "pdf",
+        "selector": ".small-medium-pdf a.pdf-download-button",
+        "attr": "href",
+        "technique": "PDF Download link (cureus.com)",
+        "example_page": "https://www.cureus.com/articles/69542-tramadol-induced-jerks",
+    },
+    {
+        "in_doc_url": "e-manuscripta.ch/",
+        "in_fulltext_url": "pdf",
+        "selector": "#titleinfoPdfDownload a.resourceLink",
+        "attr": "href",
+        "technique": "PDF Download link (e-manuscripta.ch)",
+        "example_page": "https://www.e-manuscripta.ch/zut/doi/10.7891/e-manuscripta-112176",
+    },
+    {
+        "in_doc_url": "journals.uchicago.edu",
+        "in_fulltext_url": "pdf",
+        "selector": "nav.article__navbar a.ctrl--pdf",
+        "attr": "href",
+        "technique": "PDF Download link (journals.uchicago.edu)",
+        "example_page": "https://www.journals.uchicago.edu/doi/10.14318/hau1.1.008",
+    },
+    {
+        "in_doc_url": "integrityresjournals.org",
+        "in_fulltext_url": "/article-full-text-pdf/",
+        "selector": "a[target='_blank'].btn-danger",
+        "attr": "href",
+        "technique": "PDF Download link (integrityresjournals.org)",
+        "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1",
+    },
+    {
+        "in_doc_url": "/view/",
+        "in_fulltext_url": "/download/",
+        "selector": "body.pkp_page_article a.download",
+        "attr": "href",
+        "technique": "OJS PDF Embed",
+        "example_page": "https://periodicals.karazin.ua/language_teaching/article/view/12543/11957",
+    },
+    {
+        "in_doc_url": "/article/view/",
+        "in_fulltext_url": "/article/",
+        "selector": "a.pdf",
+        "attr": "href",
+        "technique": "OJS PDF link",
+    },
+    {
+        "in_doc_url": "scitemed.com/article/",
+        "in_fulltext_url": ".pdf",
+        "selector": "li.tab_pdf_btn a",
+        "attr": "href",
+        "technique": "PDF link (scitemed.com)",
+    },
+    {
+        "in_doc_url": "://doaj.org/article/",
+        "selector": "section.col-md-8 a[target='_blank'].button--primary",
+        "attr": "href",
+        "technique": "doaj.org access link",
+    },
+    {
+        "in_doc_url": "/jvi.aspx",
+        "in_fulltext_url": "download_fulltext",
+        "selector": "div.siteMainWrapper div.siteArticleShare a[target='_blank'].list-group-item",
+        "attr": "href",
+        "technique": "erciyesmedj.com publication system PDF download link",
+    },
+    {
+        "selector": "body embed[alt='pdf']",
+        "attr": "src",
+        "technique": "embed PDF",
+        "example_pdf": "https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913",
+    },
+    {
+        "in_fulltext_url": "viewPDFInterstitial",
+        "in_doc_url": "/view/",
+        "selector": "frameset frame",
+        "attr": "src",
+        "technique": "PDF iframe (viewPDFInterstitial)",
+        "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+    },
+    {
+        # note this one has a special handler
+        "in_doc_url": "viewPDFInterstitial",
+        "in_fulltext_url": "://",
+        "selector": "head meta[http-equiv='refresh']",
+        "attr": "content",
+        "technique": "HTML meta refresh (viewPDFInterstitial)",
+        "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+    },
+    {
+        "in_doc_url": "dlib.si/details/",
+        "in_fulltext_url": "PDF",
+        "selector": "body #FilesBox a",
+        "attr": "href",
+        "technique": "dlib.si download links",
+        "example_page": "https://www.dlib.si/details/URN:NBN:SI:DOC-WR9GTSCJ",
+    },
+    {
+        "in_doc_url": "filclass.ru",
+        "in_fulltext_url": "pdf",
+        "selector": "main .pdf-article a.pdficon",
+        "attr": "href",
+        "technique": "filclass.ru PDF link",
+        "example_page": "https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism",
+    },
+    {
+        "in_doc_url": "cdnsciencepub.com",
+        "in_fulltext_url": "pdf",
+        "selector": "article .info-panel a.btn--pdf",
+        "attr": "href",
+        "technique": "cdnsciencepub.com PDF link",
+        "example_page": "https://cdnsciencepub.com/doi/10.1139/AS-2022-0011",
+    },
+    {
+        "in_doc_url": "grrjournal.com",
+        "in_fulltext_url": "pdf",
+        "selector": ".ereaders-main-section a[download]",
+        "attr": "href",
+        "technique": "grrjournal.com PDF link",
+        "example_page": "https://www.grrjournal.com/article/analysis-of-audiences-uses-and-gratifications-in-the-selected-pakistani-urdu-films",
+    },
+    {
+        "in_doc_url": "/view/",
+        "in_fulltext_url": "pdf",
+        "selector": "#articleFullText a.remote_pdf",
+        "attr": "href",
+        "technique": "OJS remote_pdf link",
+        "example_page": "https://www.mediterranea-comunicacion.org/article/view/22240",
+    },
+    {
+        "in_doc_url": "worldscientific.com/doi/abs/",
+        "in_fulltext_url": "/reader/",
+        "selector": "article.container .single__download a",
+        "attr": "href",
+        "technique": "worldscientific landing pages",
+        "example_page": "https://www.worldscientific.com/doi/abs/10.1142/S0116110521500098",
+    },
+    {
+        "in_doc_url": "worldscientific.com/doi/",
+        "in_fulltext_url": "/pdf/",
+        "selector": "noscript a[target='_blank']",
+        "attr": "href",
+        "technique": "worldscientific reader",
+        "example_page": "https://www.worldscientific.com/doi/epdf/10.1142/S0116110521500098",
+    },
+    {
+        "in_fulltext_url": "pdf",
+        "selector": ".container .view-content .download-article a",
+        "attr": "href",
+        "technique": "generic download article button",
+        "example_page": "https://science.lpnu.ua/mmc/all-volumes-and-issues/volume-9-number-1-2022/pursuit-differential-game-many-pursuers-and-one",
+    },
+    {
+        "in_fulltext_url": "pdf",
+        "selector": "body a.download-pdf",
+        "attr": "href",
+        "technique": "generic download article button",
+        "example_page": "https://plit-periodical.com.ua/arhiv/struktura-ta-vlastyvosti-materialu-zrazkiv-vyroshchenyh-metodom-selektyvnogo-lazernogo",
+    },
+    {
+        "in_doc_url": "/view/",
+        "in_fulltext_url": "/view/",
+        "selector": "body .entry_details a.pdf",
+        "attr": "href",
+        "technique": "generic OJS/preprints",
+        "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/version/5022",
+    },
+    {
+        "in_doc_url": "/view/",
+        "in_fulltext_url": "/download/",
+        "selector": "body header a.download",
+        "attr": "href",
+        "technique": "generic OJS/preprints PDF Embed",
+        "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/9327",
+    },
 ]
 
-FULLTEXT_URL_PATTERNS_SKIP = [
+FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
     # wiley has a weird almost-blank page we don't want to loop on
-    "://onlinelibrary.wiley.com/doi/pdf/"
-    "://doi.org/"
-    "://dx.doi.org/"
+    "://onlinelibrary.wiley.com/doi/pdf/",
+    "://doi.org/",
+    "://dx.doi.org/",
+    "{'embed': '",
 ]
 
-RELEASE_TYPE_MAP = {
+FULLTEXT_URL_PREFIX_SKIP: List[str] = [
+    "javascript:",
+    "about:",
+]
+
+RELEASE_TYPE_MAP: Dict[str, str] = {
     "research article": "article-journal",
     "text.serial.journal": "article-journal",
 }
@@ -426,14 +812,15 @@ class BiblioMetadata(pydantic.BaseModel):
     pdf_fulltext_url: Optional[str]
     html_fulltext_url: Optional[str]
     xml_fulltext_url: Optional[str]
+    component_url: Optional[str]
 
     class Config:
-        json_encoders = {
-            datetime.date: lambda dt: dt.isoformat()
-        }
+        json_encoders = {datetime.date: lambda dt: dt.isoformat()}
 
 
-def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict]) -> Optional[Tuple[str, str]]:
+def html_extract_fulltext_url(
+    doc_url: str, doc: HTMLParser, patterns: List[dict]
+) -> Optional[Tuple[str, str]]:
     """
     Tries to quickly extract fulltext URLs using a set of patterns. This
-    function is intendend to be generic across various extraction techniques.
+    function is intended to be generic across various extraction techniques.
@@ -442,49 +829,74 @@ def html_extract_fulltext_url(doc_url: str, doc: HTMLParser, patterns: List[dict
     """
     self_doc_url: Optional[Tuple[str, str]] = None
     for pattern in patterns:
-        if not 'selector' in pattern:
+        if "selector" not in pattern:
             continue
-        if 'in_doc_url' in pattern:
-            if not pattern['in_doc_url'] in doc_url:
+        if "in_doc_url" in pattern:
+            if pattern["in_doc_url"] not in doc_url:
                 continue
-        elem = doc.css_first(pattern['selector'])
+        elem = doc.css_first(pattern["selector"])
         if not elem:
             continue
-        if 'attr' in pattern:
-            val = elem.attrs.get(pattern['attr'])
-            if not val:
+        val = None
+        if "attr" in pattern:
+            val = elem.attrs.get(pattern["attr"])
+            # meta-refresh redirect, eg content="0; URL=http://..."; keyword may be any case
+            if val and (elem.attrs.get("http-equiv") or "").lower() == "refresh":
+                head, _, target = val.partition("=")
+                val = target.strip(" '\"") if head.lower().rstrip().endswith("url") else None
+        elif pattern.get("use_body"):
+            val = elem.text().strip()
+            if "://" not in val:
                 continue
-            val = urllib.parse.urljoin(doc_url, val)
-            assert val
-            if 'in_fulltext_url' in pattern:
-                if not pattern['in_fulltext_url'] in val:
-                    continue
-            for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
-                if skip_pattern in val.lower():
-                    continue
-            if url_fuzzy_equal(doc_url, val):
-                # don't link to self, unless no other options
-                self_doc_url = (val, pattern.get('technique', 'unknown'))
+        if not val:
+            continue
+        val = urllib.parse.urljoin(doc_url, val)
+        if "in_fulltext_url" in pattern:
+            if pattern["in_fulltext_url"] not in val:
                 continue
-            return (val, pattern.get('technique', 'unknown'))
+        skip_matched = False
+        for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
+            if skip_pattern in val.lower():
+                skip_matched = True
+                break
+        if skip_matched:
+            continue
+        for skip_pattern in FULLTEXT_URL_PREFIX_SKIP:
+            if val.lower().startswith(skip_pattern):
+                skip_matched = True
+                break
+        if skip_matched:
+            continue
+        if url_fuzzy_equal(doc_url, val):
+            # don't link to self, unless no other options
+            self_doc_url = (val, pattern.get("technique", "unknown"))
+            continue
+
+        # quirks modes / hacks
+        if "drops.dagstuhl.de" in doc_url and val.endswith(".pdf/"):
+            val = val[:-1]
+
+        return (val, pattern.get("technique", "unknown"))
     if self_doc_url:
-        print(f"  WARN: returning fulltext URL pointing to self", file=sys.stderr)
+        print("  WARN: returning fulltext URL pointing to self", file=sys.stderr)
         return self_doc_url
     return None
 
+
 def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadata]:
 
     meta: Any = dict()
     head = doc.css_first("head")
     if not head:
+        print(f"WARN: empty <head>? {doc_url}", file=sys.stderr)
         return None
 
     for field, patterns in HEAD_META_PATTERNS.items():
         for pattern in patterns:
             val = head.css_first(pattern)
-            #print((field, pattern, val))
-            if val and 'content' in val.attrs and val.attrs['content']:
-                meta[field] = val.attrs['content']
+            # print((field, pattern, val))
+            if val and "content" in val.attrs and val.attrs["content"]:
+                meta[field] = val.attrs["content"]
                 break
 
     for field, patterns in HEAD_META_LIST_PATTERNS.items():
@@ -492,53 +904,57 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
             val_list = head.css(pattern)
             if val_list:
                 for val in val_list:
-                    if 'content' in val.attrs and val.attrs['content']:
-                        if not field in meta:
+                    if "content" in val.attrs and val.attrs["content"]:
+                        if field not in meta:
                             meta[field] = []
-                        meta[field].append(val.attrs['content'])
+                        meta[field].append(val.attrs["content"])
                 break
 
     # (some) fulltext extractions
     pdf_fulltext_url = html_extract_fulltext_url(doc_url, doc, PDF_FULLTEXT_PATTERNS)
     if pdf_fulltext_url:
-        meta['pdf_fulltext_url'] = pdf_fulltext_url[0]
+        meta["pdf_fulltext_url"] = pdf_fulltext_url[0]
     xml_fulltext_url = html_extract_fulltext_url(doc_url, doc, XML_FULLTEXT_PATTERNS)
     if xml_fulltext_url:
-        meta['xml_fulltext_url'] = xml_fulltext_url[0]
+        meta["xml_fulltext_url"] = xml_fulltext_url[0]
     html_fulltext_url = html_extract_fulltext_url(doc_url, doc, HTML_FULLTEXT_PATTERNS)
     if html_fulltext_url:
-        meta['html_fulltext_url'] = html_fulltext_url[0]
+        meta["html_fulltext_url"] = html_fulltext_url[0]
+    component_url = html_extract_fulltext_url(doc_url, doc, COMPONENT_FULLTEXT_PATTERNS)
+    if component_url:
+        meta["component_url"] = component_url[0]
 
     # TODO: replace with clean_doi() et al
-    if meta.get('doi') and meta.get('doi').startswith('doi:'):
-        meta['doi'] = meta['doi'][4:]
+    if meta.get("doi") and meta.get("doi").startswith("doi:"):
+        meta["doi"] = meta["doi"][4:]
 
-    raw_identifiers = meta.pop('raw_identifiers', [])
+    raw_identifiers = meta.pop("raw_identifiers", [])
     for ident in raw_identifiers:
-        if ident.startswith('doi:10.'):
-            if not 'doi' in meta:
-                meta['doi'] = ident.replace('doi:', '')
-        elif ident.startswith('10.') and '/' in ident:
-            if not 'doi' in meta:
-                meta['doi'] = ident
-        elif ident.startswith('isbn:'):
-            if not 'isbn' in meta:
-                meta['isbn'] = ident.replace('isbn:', '')
-
-    raw_date = meta.pop('raw_date', None)
+        if ident.startswith("doi:10."):
+            if "doi" not in meta:
+                meta["doi"] = ident.replace("doi:", "")
+        elif ident.startswith("10.") and "/" in ident:
+            if "doi" not in meta:
+                meta["doi"] = ident
+        elif ident.startswith("isbn:"):
+            if "isbn" not in meta:
+                meta["isbn"] = ident.replace("isbn:", "")
+
+    raw_date = meta.pop("raw_date", None)
     if raw_date:
         parsed = dateparser.parse(raw_date)
         if parsed:
-            meta['release_date'] = parsed.date()
+            meta["release_date"] = parsed.date()
 
-    raw_release_type = meta.pop('raw_release_type', None)
+    raw_release_type = meta.pop("raw_release_type", None)
     if raw_release_type:
         release_type = RELEASE_TYPE_MAP.get(raw_release_type.lower().strip())
         if release_type:
-            meta['release_type'] = release_type
+            meta["release_type"] = release_type
 
     return BiblioMetadata(**meta)
 
+
 def load_adblock_rules() -> braveblock.Adblocker:
     """
     TODO: consider blocking very generic assets:
@@ -561,46 +977,67 @@ def load_adblock_rules() -> braveblock.Adblocker:
             "||pbs.twimg.com^",
             "||badge.dimensions.ai^",
             "||recaptcha.net^",
-
+            "||tag.imagino.com^",
+            "||consent.cookiebot.com^",
+            "||recaptcha.net^",
             # not sure about these CC badges (usually via a redirect)
-            #"||licensebuttons.net^",
-            #"||i.creativecommons.org^",
-
+            # "||licensebuttons.net^",
+            # "||i.creativecommons.org^",
             # Should we skip jquery, or other generic javascript CDNs?
-            #"||code.jquery.com^",
-            #"||ajax.googleapis.com^",
-            #"||cdnjs.cloudflare.com^",
-
+            # "||code.jquery.com^",
+            # "||ajax.googleapis.com^",
+            # "||cdnjs.cloudflare.com^",
             # badges, "share" buttons, tracking, etc
             "apis.google.com/js/plusone",
             "www.google.com/recaptcha/",
-            "js/_getUACode.js"
-
+            "js/_getUACode.js",
             # PLOS images
             "/resource/img/icon.*.16.png^",
+            # CAIRN broken tracking tag
+            "cairn-int.info//about.php?cairn_guest=",
         ],
     )
 
 
-def _extract_generic(doc: HTMLParser, selector: str, attrs: List[str], type_name: str) -> list:
+def _extract_generic(
+    doc: HTMLParser, selector: str, attrs: List[str], type_name: str
+) -> List[Dict[str, str]]:
+    """Collect {url, type} dicts from `attrs` of nodes matching `selector`, skipping junk."""
     resources = []
 
     for node in doc.css(selector):
         for attr in attrs:
-            if not attr in node.attrs:
+            if attr not in node.attrs:
                 continue
             url = node.attrs.get(attr)
+            # strip first: stray whitespace would defeat the prefix checks below
+            url = url.strip() if url else ""
+            # special-case a couple meta URI prefixes which don't match with adblock rules
+            if url.lower().startswith(
+                ("about:", "data:", "magnet:", "urn:", "mailto:", "javascript:")
+            ):
+                continue
+            if "/" not in url and "." not in url and " " in url:
+                # eg: "Ce fichier n'existe pas"
+                continue
+            # repair doubled scheme prefixes
+            if url.startswith("https://https://"):
+                url = url[8:]
+            elif url.startswith("http://http://"):
+                url = url[7:]
             if url:
                 resources.append(dict(url=url, type=type_name))
 
     return resources
 
 
-def html_extract_resources(doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker) -> list:
+def html_extract_resources(
+    doc_url: str, doc: HTMLParser, adblock: braveblock.Adblocker
+) -> List[Dict[str, str]]:
     """
     This function tries to find all the important resources in a page. The
     presumption is that the HTML document is article fulltext, and we want the
-    list of all resoures (by URL) necessary to replay the page.
+    list of all resources (by URL) necessary to replay the page.
 
     The returned resource URLs each have a type (script, img, css, etc), and
     should be fully-qualified URLs (not relative).
@@ -624,13 +1061,17 @@ def html_extract_resources(doc_url: str, doc: HTMLParser, adblock: braveblock.Ad
 
     # ensure URLs are absolute
     for r in resources:
-        r['url'] = urllib.parse.urljoin(doc_url, r['url'])
+        r["url"] = urllib.parse.urljoin(doc_url, r["url"])
 
     # filter using adblocker
-    resources = [r for r in resources if adblock.check_network_urls(r['url'], source_url=doc_url, request_type=r['type']) == False]
+    resources = [
+        r
+        for r in resources
+        if adblock.check_network_urls(r["url"], source_url=doc_url, request_type=r["type"])
+        is False
+    ]
 
     # remove duplicates
     resources = [dict(t) for t in {tuple(d.items()) for d in resources}]
 
     return resources
-