From 06d85e6004de36b7162e11d5171e2eab79f9c78a Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Sat, 22 Feb 2020 12:29:44 -0800
Subject: html: handle TypeError during bs4 parse

---
 python/sandcrawler/html.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 8e9eb1f..1893898 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -33,10 +33,16 @@ def extract_fulltext_url(html_url, html_body):
     """
     Takes an HTML document (and URL), assumed to be a landing page, and tries
     to find a fulltext PDF url.
+
+    On error, or if it fails to extract a URL, returns an empty dict.
     """
 
     host_prefix = '/'.join(html_url.split('/')[:3])
-    soup = BeautifulSoup(html_body, 'html.parser')
+    try:
+        soup = BeautifulSoup(html_body, 'html.parser')
+    except TypeError as te:
+        print("{} (url={})".format(te, html_url), file=sys.stderr)
+        return dict()
 
     ### General Tricks ###
 
-- 
cgit v1.2.3