From 2f854d076125ab5ca3c80e07857a7e5d0fa5aa1e Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Fri, 17 Apr 2020 10:09:41 -0700
Subject: fix KeyError in HTML PDF URL extraction

---
 python/sandcrawler/html.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index b924a17..8fbb0ba 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -78,7 +78,7 @@ def extract_fulltext_url(html_url, html_body):
     #   https://pubs.acs.org/doi/10.1021/acs.estlett.9b00379
     #   <a href="/doi/pdf/10.1021/acs.estlett.9b00379" title="PDF" target="_blank" class="button_primary"><i class="icon-file-pdf-o"></i><span>PDF (1 MB)</span></a>
     href = soup.find('a', attrs={"title":"PDF"})
-    if href:
+    if href and href.get('href'):  # `x in Tag` tests child nodes, not attributes
         url = href['href'].strip()
         if url.startswith('http'):
             return dict(pdf_url=url, technique='href_title')
-- 
cgit v1.2.3