old HTML extractors: handle null tag

author: Bryan Newbold <bnewbold@archive.org> 2021-09-08 19:36:43 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-09-08 19:36:43 -0700
commit: cc240ab5d5c3a9970333abdc0b16867a0dc5c418 (patch)
tree: b310513ef58f02580889b11950ac43f87f213160 /python
parent: 047cfcebd8ff792d6da9c10b0bee27ec689a4e9f (diff)
download: sandcrawler-cc240ab5d5c3a9970333abdc0b16867a0dc5c418.tar.gz
sandcrawler-cc240ab5d5c3a9970333abdc0b16867a0dc5c418.zip
1 file changed, 9 insertions, 8 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index e3d95bc..cd0a8e8 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -99,14 +99,15 @@ def extract_fulltext_url(html_url, html_body):
     if 'sciencedirect.com/science/article/pii/' in html_url and not html_url.endswith(".pdf"):
         json_tag = soup.find("script", attrs={"type": "application/json", "data-iso-key": "_0"})
         url = None
-        try:
-            json_text = json_tag.string
-            json_meta = json.loads(json_text)
-            pdf_meta = json_meta['article']['pdfDownload']['urlMetadata']
-            # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf
-            url = html_url + pdf_meta['pdfExtension'] + "?md5=" + pdf_meta['queryParams']['md5'] + "&pid=" + pdf_meta['queryParams']['pid']
-        except (KeyError, TypeError, json.JSONDecodeError):
-            pass
+        if json_tag:
+            try:
+                json_text = json_tag.string
+                json_meta = json.loads(json_text)
+                pdf_meta = json_meta['article']['pdfDownload']['urlMetadata']
+                # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf
+                url = html_url + pdf_meta['pdfExtension'] + "?md5=" + pdf_meta['queryParams']['md5'] + "&pid=" + pdf_meta['queryParams']['pid']
+            except (KeyError, TypeError, json.JSONDecodeError):
+                pass
         if url:
             return dict(pdf_url=url, technique="sciencedirect-munge-json")
author	Bryan Newbold <bnewbold@archive.org>	2021-09-08 19:36:43 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-09-08 19:36:43 -0700
commit	cc240ab5d5c3a9970333abdc0b16867a0dc5c418 (patch)
tree	b310513ef58f02580889b11950ac43f87f213160 /python
parent	047cfcebd8ff792d6da9c10b0bee27ec689a4e9f (diff)
download	sandcrawler-cc240ab5d5c3a9970333abdc0b16867a0dc5c418.tar.gz sandcrawler-cc240ab5d5c3a9970333abdc0b16867a0dc5c418.zip