about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-02-22 12:29:44 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-02-22 12:29:44 -0800
commit 06d85e6004de36b7162e11d5171e2eab79f9c78a (patch)
tree 5ecf945a5f68c71895adb19b8771595c3e2cefc0 /python
parent 5bd09c49aa5a29643f45db390ccf2f099b2d143d (diff)
download sandcrawler-06d85e6004de36b7162e11d5171e2eab79f9c78a.tar.gz
sandcrawler-06d85e6004de36b7162e11d5171e2eab79f9c78a.zip
html: handle TypeError during bs4 parse
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/html.py 8
1 files changed, 7 insertions, 1 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 8e9eb1f..1893898 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -33,10 +33,16 @@ def extract_fulltext_url(html_url, html_body):
"""
Takes an HTML document (and URL), assumed to be a landing page, and tries
to find a fulltext PDF url.
+
+ On error, or if fails to extract a URL, returns an empty dict.
"""
host_prefix = '/'.join(html_url.split('/')[:3])
- soup = BeautifulSoup(html_body, 'html.parser')
+ try:
+ soup = BeautifulSoup(html_body, 'html.parser')
+ except TypeError as te:
+ print("{} (url={})".format(te, html_url, file=sys.stderr))
+ return dict()
### General Tricks ###