From 1263ee33535d232d702324980e7ff69305ed8795 Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Fri, 21 May 2021 17:41:41 -0700
Subject: ingest PDF extraction updates

---
 python/sandcrawler/html.py          | 17 ++++++++++++
 python/sandcrawler/html_metadata.py | 54 +++++++++++++++++++++++++++++++++++++
 python/sandcrawler/ingest.py        |  5 ++--
 3 files changed, 74 insertions(+), 2 deletions(-)

(limited to 'python')

diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index d3f5cfe..ca600e4 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -94,6 +94,23 @@ def extract_fulltext_url(html_url, html_body):
                 url = url.split('?via')[0]
                 return dict(next_url=url, technique="elsevier-linkinghub")
 
+    # sciencedirect PDF URL extract: the article page embeds JSON metadata for the PDF link
+    # https://www.sciencedirect.com/science/article/pii/S0169204621000670
+    if 'sciencedirect.com/science/article/pii/' in html_url and not html_url.endswith(".pdf"):
+        json_tag = soup.find("script", attrs={"type": "application/json", "data-iso-key": "_0"})
+        url = None
+        # no tag or no text in it: skip this rule instead of raising AttributeError/TypeError
+        if json_tag is not None and json_tag.string:
+            try:
+                pdf_meta = json.loads(json_tag.string)['article']['pdfDownload']['urlMetadata']
+                # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf
+                url = html_url + pdf_meta['pdfExtension'] + "?md5=" + pdf_meta['queryParams']['md5'] + "&pid=" + pdf_meta['queryParams']['pid']
+            except (KeyError, TypeError, ValueError):
+                # missing keys, wrong JSON shape, or undecodable JSON (JSONDecodeError is a ValueError)
+                url = None
+        if url:
+            return dict(pdf_url=url, technique="sciencedirect-munge-json")
+
     # sciencedirect PDF bounce page
     # https://www.sciencedirect.com/science/article/pii/S2590109519300424/pdfft?md5=854f43a44de186eb58674b8e20631691&pid=1-s2.0-S2590109519300424-main.pdf
     if '://www.sciencedirect.com/' in html_url and html_url.endswith(".pdf"):
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 1e58778..c805f0a 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -383,6 +383,60 @@ PDF_FULLTEXT_PATTERNS: List[dict] = [
         "technique": "PDF URL link",
         "example_page": "https://www.degruyter.com/document/doi/10.1515/zaw-2021-0001/html",
     },
+    {
+        "in_doc_url": "repositorio.unicamp.br/handle/",
+        "in_fulltext_url": "/bitstream/",
+        "selector": "table.panel-body a[target='_blank']",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "http://www.repositorio.unicamp.br/handle/REPOSIP/287750",
+    },
+    {
+        "in_doc_url": "dlc.library.columbia.edu/durst/",
+        "selector": "dd.blacklight-lib_non_item_in_context_url_ssm a[href]",
+        "attr": "href",
+        "technique": "Access URL link",
+        "example_page": "https://dlc.library.columbia.edu/durst/cul:18931zcrk9",
+    },
+    {
+        "in_doc_url": "fldeploc.dep.state.fl.us/geodb_query/fgs_doi",
+        "in_fulltext_url": "pdf",
+        "selector": "p a[href]",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "http://fldeploc.dep.state.fl.us/geodb_query/fgs_doi.asp?searchCode=IC29",
+    },
+    {
+        "in_doc_url": "preprints.jmir.org/preprint/",
+        "selector": "a.pdf-download-button",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "https://preprints.jmir.org/preprint/22556",
+    },
+    {
+        "in_doc_url": "bloomsburycollections.com/",
+        "in_fulltext_url": "pdf",
+        "selector": "li.download-item a[href]",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "https://www.bloomsburycollections.com/book/the-political-economies-of-media-the-transformation-of-the-global-media-industries/the-political-economies-of-media-and-the-transformation-of-the-global-media-industries",
+    },
+    {
+        "in_doc_url": "emerald.com/insight/content/",
+        "in_fulltext_url": "pdf",
+        "selector": "a.intent_pdf_link",
+        "attr": "href",
+        "technique": "PDF URL link",
+        "example_page": "https://www.emerald.com/insight/content/doi/10.1108/RAMJ-11-2020-0065/full/html",
+    },
+    {
+        "in_doc_url": "ingentaconnect.com/content/",
+        "in_fulltext_url": "pdf",
+        "selector": "a.pdf[data-popup]",
+        "attr": "data-popup",
+        "technique": "PDF URL link",
+        "example_page": "https://www.ingentaconnect.com/content/ista/sst/2021/00000049/00000001/art00007",
+    },
 ]
 
 FULLTEXT_URL_PATTERNS_SKIP = [
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index eb8e256..b610ab4 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -128,8 +128,9 @@ class IngestFileWorker(SandcrawlerWorker):
         self.wall_blocklist = [
             # loginwall
             "://profile.thieme.de/HTML/sso/ejournals/login.htm",
-            "://login.bepress.com/"
-            "?SAMLRequest="
+            "://login.bepress.com/",  # keep the trailing comma: adjacent string literals are implicitly concatenated into one entry
+            "?SAMLRequest=",
+            "://osapublishing.org/captcha/",  # presumably a captcha interstitial rather than a login wall -- TODO confirm
         ]
 
         # these are special-case web domains for which we want SPN2 to not run
-- 
cgit v1.2.3