1 files changed, 7 insertions, 1 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 8e9eb1f..1893898 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -33,10 +33,16 @@ def extract_fulltext_url(html_url, html_body):
     """
     Takes an HTML document (and URL), assumed to be a landing page, and tries
     to find a fulltext PDF url.
+
+    On error, or if it fails to extract a URL, returns an empty dict.
     """
 
     host_prefix = '/'.join(html_url.split('/')[:3])
-    soup = BeautifulSoup(html_body, 'html.parser')
+    try:
+        soup = BeautifulSoup(html_body, 'html.parser')
+    except TypeError as te:  # report on stderr and return an empty result
+        print("{} (url={})".format(te, html_url), file=sys.stderr)
+        return dict()
 
     ### General Tricks ###