aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-11-08 22:19:46 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-08 22:19:46 -0800
commit: 4dcbeb2d044041bd8cfb169374474e8f80ebf989 (patch)
tree: 1b85c47d656dcb053334b763efd70fda295127d2
parent: 66adc96e200b9fbcf8029177c7cee12872a1f563 (diff)
download: sandcrawler-4dcbeb2d044041bd8cfb169374474e8f80ebf989.tar.gz
download: sandcrawler-4dcbeb2d044041bd8cfb169374474e8f80ebf989.zip
html: handle more traf error cases
-rw-r--r-- python/sandcrawler/html_ingest.py | 4
1 files changed, 2 insertions, 2 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 50b193c..7594365 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -25,10 +25,10 @@ def html_extract_body_teixml(doc: bytes) -> dict:
include_comments=False,
include_formatting=True,
)
- except ValueError as ve:
+ except (ValueError, TypeError) as e:
return dict(
status="parse-error",
- error_msg=str(ve)[:1000],
+ error_msg=str(e)[:1000],
)
if tei_xml:
body_txt = teixml_body_text(tei_xml)