about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/html.py 17
1 files changed, 9 insertions, 8 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index e3d95bc..cd0a8e8 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -99,14 +99,15 @@ def extract_fulltext_url(html_url, html_body):
if 'sciencedirect.com/science/article/pii/' in html_url and not html_url.endswith(".pdf"):
json_tag = soup.find("script", attrs={"type": "application/json", "data-iso-key": "_0"})
url = None
- try:
- json_text = json_tag.string
- json_meta = json.loads(json_text)
- pdf_meta = json_meta['article']['pdfDownload']['urlMetadata']
- # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf
- url = html_url + pdf_meta['pdfExtension'] + "?md5=" + pdf_meta['queryParams']['md5'] + "&pid=" + pdf_meta['queryParams']['pid']
- except (KeyError, TypeError, json.JSONDecodeError):
- pass
+ if json_tag:
+ try:
+ json_text = json_tag.string
+ json_meta = json.loads(json_text)
+ pdf_meta = json_meta['article']['pdfDownload']['urlMetadata']
+ # https://www.sciencedirect.com/science/article/pii/S0169204621000670/pdfft?md5=c4a83d06b334b627ded74cf9423bfa56&pid=1-s2.0-S0169204621000670-main.pdf
+ url = html_url + pdf_meta['pdfExtension'] + "?md5=" + pdf_meta['queryParams']['md5'] + "&pid=" + pdf_meta['queryParams']['pid']
+ except (KeyError, TypeError, json.JSONDecodeError):
+ pass
if url:
return dict(pdf_url=url, technique="sciencedirect-munge-json")