diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-04-17 10:09:41 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-04-17 10:09:41 -0700 |
commit | 2f854d076125ab5ca3c80e07857a7e5d0fa5aa1e (patch) | |
tree | faaa2d60d448c292484746b24b742ce0c601f2c7 /python | |
parent | 622c5bb1f9b6f4d773a31ead2fd9b14413a6fb00 (diff) | |
download | sandcrawler-2f854d076125ab5ca3c80e07857a7e5d0fa5aa1e.tar.gz sandcrawler-2f854d076125ab5ca3c80e07857a7e5d0fa5aa1e.zip |
fix KeyError in HTML PDF URL extraction
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/html.py | 2 |
1 file changed, 1 insertion, 1 deletion
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index b924a17..8fbb0ba 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -78,7 +78,7 @@ def extract_fulltext_url(html_url, html_body):
     # https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379
     # <a href="/doi/pdf/10.1021/acs.estlett.9b00379" title="PDF" target="_blank" class="button_primary"><i class="icon-file-pdf-o"></i><span>PDF (1 MB)</span></a>
     href = soup.find('a', attrs={"title":"PDF"})
-    if href:
+    if href and 'href' in href:
         url = href['href'].strip()
         if url.startswith('http'):
             return dict(pdf_url=url, technique='href_title')