From be8f1d134681caaa15485246b65551a67e5bd5a5 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 24 Feb 2020 17:38:42 -0800 Subject: ingest: handle missing chemrxiv tag --- python/sandcrawler/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'python') diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index e993e74..c76d7a2 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -303,7 +303,7 @@ def extract_fulltext_url(html_url, html_body): if "://chemrxiv.org/articles/" in html_url or '.figshare.org/articles/' in html_url: # json_tag = soup.find('script', id="app-data", attrs={"type": "text/json"}) - if json_tag.string: + if json_tag and json_tag.string: app_data = json.loads(json_tag.string) # "exportPdfDownloadUrl": "https://s3-eu-west-1.amazonaws.com/itempdf74155353254prod/10101419/Biradical_Formation_by_Deprotonation_in_Thiazole-Derivatives__The_Hidden_Nature_of_Dasatinib_v1.pdf" url = app_data.get('article', {}).get('exportPdfDownloadUrl') -- cgit v1.2.3