From 2f854d076125ab5ca3c80e07857a7e5d0fa5aa1e Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 17 Apr 2020 10:09:41 -0700 Subject: fix KeyError in HTML PDF URL extraction --- python/sandcrawler/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py index b924a17..8fbb0ba 100644 --- a/python/sandcrawler/html.py +++ b/python/sandcrawler/html.py @@ -78,7 +78,7 @@ def extract_fulltext_url(html_url, html_body): # https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379 # PDF (1 MB) href = soup.find('a', attrs={"title":"PDF"}) - if href: + if href and 'href' in href: url = href['href'].strip() if url.startswith('http'): return dict(pdf_url=url, technique='href_title') -- cgit v1.2.3