diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-19 10:58:43 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-19 10:58:43 -0700 |
commit | db97d3dcfa497c6865399351fa8702b92a940459 (patch) | |
tree | 54c9271d1f29b7d29897bfab5ae6df5e7df399b1 | |
parent | d60a8d6b2380a5d6599203787da59c57a8664322 (diff) | |
download | sandcrawler-db97d3dcfa497c6865399351fa8702b92a940459.tar.gz sandcrawler-db97d3dcfa497c6865399351fa8702b92a940459.zip |
handle UnboundLocalError in HTML parsing
-rw-r--r-- | python/sandcrawler/html.py | 5 |
1 file changed, 4 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 3eadc7b..88ea41b 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -42,7 +42,10 @@ def extract_fulltext_url(html_url, html_body):
     try:
         soup = BeautifulSoup(html_body, 'html.parser')
     except TypeError as te:
-        print("{} (url={})".format(te, html_url, file=sys.stderr))
+        print(f"{te} (url={html_url})", file=sys.stderr)
+        return dict()
+    except UnboundLocalError as ule:
+        print(f"{ule} (url={html_url})", file=sys.stderr)
         return dict()
 
     ### General Tricks ###
```