aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--python/sandcrawler/html.py8
1 files changed, 7 insertions, 1 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 8e9eb1f..1893898 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -33,10 +33,16 @@ def extract_fulltext_url(html_url, html_body):
"""
Takes an HTML document (and URL), assumed to be a landing page, and tries
to find a fulltext PDF url.
+
+ On error, or if it fails to extract a URL, returns an empty dict.
"""
host_prefix = '/'.join(html_url.split('/')[:3])
- soup = BeautifulSoup(html_body, 'html.parser')
+ try:
+ soup = BeautifulSoup(html_body, 'html.parser')
+ except TypeError as te:
+ print("{} (url={})".format(te, html_url, file=sys.stderr))
+ return dict()
### General Tricks ###