diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 22:19:46 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 22:19:46 -0800 |
commit | 4dcbeb2d044041bd8cfb169374474e8f80ebf989 (patch) | |
tree | 1b85c47d656dcb053334b763efd70fda295127d2 /python | |
parent | 66adc96e200b9fbcf8029177c7cee12872a1f563 (diff) | |
download | sandcrawler-4dcbeb2d044041bd8cfb169374474e8f80ebf989.tar.gz sandcrawler-4dcbeb2d044041bd8cfb169374474e8f80ebf989.zip |
html: handle more traf error cases
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/html_ingest.py | 4 |
1 files changed, 2 insertions, 2 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 50b193c..7594365 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -25,10 +25,10 @@ def html_extract_body_teixml(doc: bytes) -> dict:
             include_comments=False,
             include_formatting=True,
         )
-    except ValueError as ve:
+    except (ValueError, TypeError) as e:
         return dict(
             status="parse-error",
-            error_msg=str(ve)[:1000],
+            error_msg=str(e)[:1000],
         )
     if tei_xml:
         body_txt = teixml_body_text(tei_xml)