1 files changed, 202 insertions, 8 deletions
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index c46788e..1e2d197 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -207,7 +207,7 @@ XML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "technique": "SciElo XML link",
     },
     {
-        "in_doc_url": "/article/view/",
+        "in_doc_url": "/view/",
         "in_fulltext_url": "viewXML",
         "selector": "a[class='obj_galley_link']",
         "attr": "href",
@@ -255,6 +255,12 @@ HTML_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "attr": "href",
         "technique": "dovepress fulltext link",
     },
+    {
+        "in_doc_url": "://doaj.org/article/",
+        "selector": "section.col-md-8 a[target='_blank'].button--primary",
+        "attr": "href",
+        "technique": "doaj.org access link",
+    },
 ]
 
 COMPONENT_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
@@ -325,10 +331,10 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "example_page": "https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379",
     },
     {
-        "in_doc_url": "/article/view/",
+        "in_doc_url": "/view/",
         "selector": "a#pdfDownloadLink",
         "attr": "href",
-        "technique": "pdfDownloadLink link",
+        "technique": "OJS pdfDownloadLink link",
         "example_page": "http://www.revistas.unam.mx/index.php/rep/article/view/35503/32336",
     },
     {
@@ -597,13 +603,171 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "technique": "PDF Download link (journals.uchicago.edu)",
         "example_page": "https://www.journals.uchicago.edu/doi/10.14318/hau1.1.008",
     },
+    {
+        "in_doc_url": "integrityresjournals.org",
+        "in_fulltext_url": "/article-full-text-pdf/",
+        "selector": "a[target='_blank'].btn-danger",
+        "attr": "href",
+        "technique": "PDF Download link (integrityresjournals.org)",
+        "example_page": "https://integrityresjournals.org/journal/JBBD/article-abstract/750B649A1",
+    },
+    {
+        "in_doc_url": "/view/",
+        "in_fulltext_url": "/download/",
+        "selector": "body.pkp_page_article a.download",
+        "attr": "href",
+        "technique": "OJS PDF Embed",
+        "example_page": "https://periodicals.karazin.ua/language_teaching/article/view/12543/11957",
+    },
+    {
+        "in_doc_url": "/article/view/",
+        "in_fulltext_url": "/article/",
+        "selector": "a.pdf",
+        "attr": "href",
+        "technique": "OJS PDF link",
+    },
+    {
+        "in_doc_url": "scitemed.com/article/",
+        "in_fulltext_url": ".pdf",
+        "selector": "li.tab_pdf_btn a",
+        "attr": "href",
+        "technique": "PDF link (scitemed.com)",
+    },
+    {
+        "in_doc_url": "://doaj.org/article/",
+        "selector": "section.col-md-8 a[target='_blank'].button--primary",
+        "attr": "href",
+        "technique": "doaj.org access link",
+    },
+    {
+        "in_doc_url": "/jvi.aspx",
+        "in_fulltext_url": "download_fulltext",
+        "selector": "div.siteMainWrapper div.siteArticleShare a[target='_blank'].list-group-item",
+        "attr": "href",
+        "technique": "erciyesmedj.com publication system PDF download link",
+    },
+    {
+        "selector": "body embed[alt='pdf']",
+        "attr": "src",
+        "technique": "embed PDF",
+        "example_page": "https://www.arkat-usa.org/arkivoc-journal/browse-arkivoc/ark.5550190.0006.913",
+    },
+    {
+        "in_fulltext_url": "viewPDFInterstitial",
+        "in_doc_url": "/view/",
+        "selector": "frameset frame",
+        "attr": "src",
+        "technique": "PDF iframe (viewPDFInterstitial)",
+        "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+    },
+    {
+        # note this one has a special handler
+        "in_doc_url": "viewPDFInterstitial",
+        "in_fulltext_url": "://",
+        "selector": "head meta[http-equiv='refresh']",
+        "attr": "content",
+        "technique": "HTML meta refresh (viewPDFInterstitial)",
+        "example_page": "http://revistaadmmade.estacio.br/index.php/reeduc/article/view/1910/47965873",
+    },
+    {
+        "in_doc_url": "dlib.si/details/",
+        "in_fulltext_url": "PDF",
+        "selector": "body #FilesBox a",
+        "attr": "href",
+        "technique": "dlib.si download links",
+        "example_page": "https://www.dlib.si/details/URN:NBN:SI:DOC-WR9GTSCJ",
+    },
+    {
+        "in_doc_url": "filclass.ru",
+        "in_fulltext_url": "pdf",
+        "selector": "main .pdf-article a.pdficon",
+        "attr": "href",
+        "technique": "filclass.ru PDF link",
+        "example_page": "https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism",
+    },
+    {
+        "in_doc_url": "cdnsciencepub.com",
+        "in_fulltext_url": "pdf",
+        "selector": "article .info-panel a.btn--pdf",
+        "attr": "href",
+        "technique": "cdnsciencepub.com PDF link",
+        "example_page": "https://cdnsciencepub.com/doi/10.1139/AS-2022-0011",
+    },
+    {
+        "in_doc_url": "grrjournal.com",
+        "in_fulltext_url": "pdf",
+        "selector": ".ereaders-main-section a[download]",
+        "attr": "href",
+        "technique": "grrjournal.com PDF link",
+        "example_page": "https://www.grrjournal.com/article/analysis-of-audiences-uses-and-gratifications-in-the-selected-pakistani-urdu-films",
+    },
+    {
+        "in_doc_url": "/view/",
+        "in_fulltext_url": "pdf",
+        "selector": "#articleFullText a.remote_pdf",
+        "attr": "href",
+        "technique": "OJS remote_pdf link",
+        "example_page": "https://www.mediterranea-comunicacion.org/article/view/22240",
+    },
+    {
+        "in_doc_url": "worldscientific.com/doi/abs/",
+        "in_fulltext_url": "/reader/",
+        "selector": "article.container .single__download a",
+        "attr": "href",
+        "technique": "worldscientific landing pages",
+        "example_page": "https://www.worldscientific.com/doi/abs/10.1142/S0116110521500098",
+    },
+    {
+        "in_doc_url": "worldscientific.com/doi/",
+        "in_fulltext_url": "/pdf/",
+        "selector": "noscript a[target='_blank']",
+        "attr": "href",
+        "technique": "worldscientific reader",
+        "example_page": "https://www.worldscientific.com/doi/epdf/10.1142/S0116110521500098",
+    },
+    {
+        "in_fulltext_url": "pdf",
+        "selector": ".container .view-content .download-article a",
+        "attr": "href",
+        "technique": "generic download article button",
+        "example_page": "https://science.lpnu.ua/mmc/all-volumes-and-issues/volume-9-number-1-2022/pursuit-differential-game-many-pursuers-and-one",
+    },
+    {
+        "in_fulltext_url": "pdf",
+        "selector": "body a.download-pdf",
+        "attr": "href",
+        "technique": "generic download article button",
+        "example_page": "https://plit-periodical.com.ua/arhiv/struktura-ta-vlastyvosti-materialu-zrazkiv-vyroshchenyh-metodom-selektyvnogo-lazernogo",
+    },
+    {
+        "in_doc_url": "/view/",
+        "in_fulltext_url": "/view/",
+        "selector": "body .entry_details a.pdf",
+        "attr": "href",
+        "technique": "generic OJS/preprints",
+        "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/version/5022",
+    },
+    {
+        "in_doc_url": "/view/",
+        "in_fulltext_url": "/download/",
+        "selector": "body header a.download",
+        "attr": "href",
+        "technique": "generic OJS/preprints PDF Embed",
+        "example_page": "https://preprints.scielo.org/index.php/scielo/preprint/view/4729/9327",
+    },
 ]
 
 FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
     # wiley has a weird almost-blank page we don't want to loop on
-    "://onlinelibrary.wiley.com/doi/pdf/"
-    "://doi.org/"
-    "://dx.doi.org/"
+    "://onlinelibrary.wiley.com/doi/pdf/",
+    "://doi.org/",
+    "://dx.doi.org/",
+    "{'embed': '",
+]
+
+FULLTEXT_URL_PREFIX_SKIP: List[str] = [
+    "javascript:",
+    "about:",
 ]
 
 RELEASE_TYPE_MAP: Dict[str, str] = {
@@ -676,6 +840,9 @@ def html_extract_fulltext_url(
         val = None
         if "attr" in pattern:
             val = elem.attrs.get(pattern["attr"])
+            # handle HTML meta refresh, eg content="2;URL=https://..." ("url=" is case-insensitive)
+            if val and elem.attrs.get("http-equiv") and "url=" in val.lower():
+                val = val[val.lower().index("url=") + 4 :].strip()
         elif pattern.get("use_body"):
             val = elem.text()
             if "://" not in val:
@@ -687,13 +854,28 @@ def html_extract_fulltext_url(
         if "in_fulltext_url" in pattern:
             if pattern["in_fulltext_url"] not in val:
                 continue
+        skip_matched = False
         for skip_pattern in FULLTEXT_URL_PATTERNS_SKIP:
             if skip_pattern in val.lower():
-                continue
+                skip_matched = True
+                break
+        if skip_matched:
+            continue
+        for skip_pattern in FULLTEXT_URL_PREFIX_SKIP:
+            if val.lower().startswith(skip_pattern):
+                skip_matched = True
+                break
+        if skip_matched:
+            continue
         if url_fuzzy_equal(doc_url, val):
             # don't link to self, unless no other options
             self_doc_url = (val, pattern.get("technique", "unknown"))
             continue
+
+        # quirks modes / hacks
+        if "drops.dagstuhl.de" in doc_url and val.endswith(".pdf/"):
+            val = val[:-1]
+
         return (val, pattern.get("technique", "unknown"))
     if self_doc_url:
         print("  WARN: returning fulltext URL pointing to self", file=sys.stderr)
@@ -795,6 +977,9 @@ def load_adblock_rules() -> braveblock.Adblocker:
             "||pbs.twimg.com^",
             "||badge.dimensions.ai^",
             "||recaptcha.net^",
+            "||tag.imagino.com^",
+            "||consent.cookiebot.com^",
+            "||recaptcha.net^",
             # not sure about these CC badges (usually via a redirect)
             # "||licensebuttons.net^",
             # "||i.creativecommons.org^",
@@ -808,6 +993,8 @@ def load_adblock_rules() -> braveblock.Adblocker:
             "js/_getUACode.js"
             # PLOS images
             "/resource/img/icon.*.16.png^",
+            # CAIRN broken tracking tag
+            "cairn-int.info//about.php?cairn_guest=",
         ],
     )
 
@@ -824,12 +1011,19 @@ def _extract_generic(
             url = node.attrs.get(attr)
             # special-case a couple meta URI prefixes which don't match with adblock rules
             skip = False
-            for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:"]:
+            for prefix in ["about:", "data:", "magnet:", "urn:", "mailto:", "javascript:"]:
                 if url and url.startswith(prefix):
                     skip = True
                     break
+            if url and "/" not in url and "." not in url and " " in url:
+                # eg: "Ce fichier n'existe pas"
+                skip = True
             if skip:
                 continue
+            if url and url.startswith("https://https://"):
+                url = url[8:]
+            elif url and url.startswith("http://http://"):
+                url = url[7:]
             if url:
                 # print(url, file=sys.stderr)
                 resources.append(dict(url=url.strip(), type=type_name))