about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-05-19 10:58:43 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-05-19 10:58:43 -0700
commit: db97d3dcfa497c6865399351fa8702b92a940459 (patch)
tree: 54c9271d1f29b7d29897bfab5ae6df5e7df399b1
parent: d60a8d6b2380a5d6599203787da59c57a8664322 (diff)
download: sandcrawler-db97d3dcfa497c6865399351fa8702b92a940459.tar.gz
download: sandcrawler-db97d3dcfa497c6865399351fa8702b92a940459.zip
handle UnboundLocalError in HTML parsing
-rw-r--r-- python/sandcrawler/html.py 5
1 files changed, 4 insertions, 1 deletions
diff --git a/python/sandcrawler/html.py b/python/sandcrawler/html.py
index 3eadc7b..88ea41b 100644
--- a/python/sandcrawler/html.py
+++ b/python/sandcrawler/html.py
@@ -42,7 +42,10 @@ def extract_fulltext_url(html_url, html_body):
try:
soup = BeautifulSoup(html_body, 'html.parser')
except TypeError as te:
- print("{} (url={})".format(te, html_url, file=sys.stderr))
+ print(f"{te} (url={html_url})", file=sys.stderr)
+ return dict()
+ except UnboundLocalError as ule:
+ print(f"{ule} (url={html_url})", file=sys.stderr)
return dict()
### General Tricks ###