ingest: more PDF fulltext tricks

author: Bryan Newbold <bnewbold@archive.org> 2022-07-20 20:41:32 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-07-20 20:41:32 -0700
commit: 966df43c77581770df4d83d37afe8ead41d51abb (patch)
tree: a03e53fb9058a4094fcfbb8f2d97ba69e1a1df87
parent: 98b95dea4eafec78f16f6afbabfe65aa2489e78f (diff)
download: sandcrawler-966df43c77581770df4d83d37afe8ead41d51abb.tar.gz sandcrawler-966df43c77581770df4d83d37afe8ead41d51abb.zip
2 files changed, 36 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 73c808c..207f067 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -323,6 +323,13 @@ def extract_fulltext_url(html_url: str, html_body: bytes) -> Dict[str, str]:
                 technique="google-drive",
             )
 
+    # https://doi.org/10.24850/j-tyca-14-4-7
+    # https://docs.google.com/viewer?url=http://revistatyca.org.mx/index.php/tyca/libraryFiles/downloadPublic/150
+    if "docs.google.com/viewer?url=" in html_url:
+        original_url = html_url.split("?url=")[1]
+        if original_url:
+            return dict(pdf_url=original_url, technique="docs.google.com viewer")
+
     ### below here we are doing guesses
 
     # generic guess: try current URL plus .pdf, if it exists in the HTML body
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index e5a1640..3d9e8ca 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -685,6 +685,30 @@ PDF_FULLTEXT_PATTERNS: List[Dict[str, str]] = [
         "technique": "filclass.ru PDF link",
         "example_page": "https://filclass.ru/en/archive/2018/2-52/the-chronicle-of-domestic-literary-criticism",
     },
+    {
+        "in_doc_url": "cdnsciencepub.com",
+        "in_fulltext_url": "pdf",
+        "selector": "article .info-panel a.btn--pdf",
+        "attr": "href",
+        "technique": "cdnsciencepub.com PDF link",
+        "example_page": "https://cdnsciencepub.com/doi/10.1139/AS-2022-0011",
+    },
+    {
+        "in_doc_url": "grrjournal.com",
+        "in_fulltext_url": "pdf",
+        "selector": ".ereaders-main-section a[download]",
+        "attr": "href",
+        "technique": "grrjournal.com PDF link",
+        "example_page": "https://www.grrjournal.com/article/analysis-of-audiences-uses-and-gratifications-in-the-selected-pakistani-urdu-films",
+    },
+    {
+        "in_doc_url": "/article/view/",
+        "in_fulltext_url": "pdf",
+        "selector": "#articleFullText a.remote_pdf",
+        "attr": "href",
+        "technique": "OJS remote_pdf link",
+        "example_page": "https://www.mediterranea-comunicacion.org/article/view/22240",
+    },
 ]
 
 FULLTEXT_URL_PATTERNS_SKIP: List[str] = [
@@ -801,6 +825,11 @@ def html_extract_fulltext_url(
             # don't link to self, unless no other options
             self_doc_url = (val, pattern.get("technique", "unknown"))
             continue
+
+        # quirks modes / hacks
+        if "drops.dagstuhl.de" in doc_url and val.endswith(".pdf/"):
+            val = val[:-1]
+
         return (val, pattern.get("technique", "unknown"))
     if self_doc_url:
         print("  WARN: returning fulltext URL pointing to self", file=sys.stderr)
author	Bryan Newbold <bnewbold@archive.org>	2022-07-20 20:41:32 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2022-07-20 20:41:32 -0700
commit	966df43c77581770df4d83d37afe8ead41d51abb (patch)
tree	a03e53fb9058a4094fcfbb8f2d97ba69e1a1df87
parent	98b95dea4eafec78f16f6afbabfe65aa2489e78f (diff)
download	sandcrawler-966df43c77581770df4d83d37afe8ead41d51abb.tar.gz sandcrawler-966df43c77581770df4d83d37afe8ead41d51abb.zip