ingest PDF extraction updates

author: Bryan Newbold <bnewbold@archive.org> 2021-05-21 17:41:41 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-05-21 17:41:41 -0700
commit: 1263ee33535d232d702324980e7ff69305ed8795 (patch)
tree: f4ec34e52aec28c42ba432fab2945419a3658d3f /python/sandcrawler/html.py
parent: 071af9a4832dcb24be417de9b658d678056b5bf2 (diff)
download: sandcrawler-1263ee33535d232d702324980e7ff69305ed8795.tar.gz sandcrawler-1263ee33535d232d702324980e7ff69305ed8795.zip
1 files changed, 17 insertions, 0 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index d3f5cfe..ca600e4 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -94,6 +94,23 @@ def extract_fulltext_url(html_url, html_body):
                 url = url.split('?via')[0]
                 return dict(next_url=url, technique="elsevier-linkinghub")
 
+    # sciencedirect PDF URL extract
+    # https://www.sciencedirect.com/science/article/pii/S0169204621000670
+    if 'sciencedirect.com/science/article/pii/' in html_url and not html_url.endswith(".pdf"):
+        json_tag = soup.find("script", attrs={"type": "application/json", "data-iso-key": "_0"})
+        url = None
+        try:
+            json_text = json_tag.string
+            json_meta = json.loads(json_text)
+            pdf_meta = json_meta['article']['pdfDownload']['urlMetadata']
+            print(pdf_meta, file=sys.stderr)
+            # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf
+            url = html_url + pdf_meta['pdfExtension'] + "?md5=" + pdf_meta['queryParams']['md5'] + "&pid=" + pdf_meta['queryParams']['pid']
+        except Exception as e:
+            raise e
+        if url:
+            return dict(pdf_url=url, technique="sciencedirect-munge-json")
+
     # sciencedirect PDF bounce page
     # https://www.sciencedirect.com/science/article/pii/S2590109519300424/pdfft?md5=854f43a44de186eb58674b8e20631691&pid=1-s2.0-S2590109519300424-main.pdf
     if '://www.sciencedirect.com/' in html_url and html_url.endswith(".pdf"):
author	Bryan Newbold <bnewbold@archive.org>	2021-05-21 17:41:41 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-05-21 17:41:41 -0700
commit	1263ee33535d232d702324980e7ff69305ed8795 (patch)
tree	f4ec34e52aec28c42ba432fab2945419a3658d3f /python/sandcrawler/html.py
parent	071af9a4832dcb24be417de9b658d678056b5bf2 (diff)
download	sandcrawler-1263ee33535d232d702324980e7ff69305ed8795.tar.gz sandcrawler-1263ee33535d232d702324980e7ff69305ed8795.zip