about | summary | refs | log | tree | commit | diff | stats
path: root/python/sandcrawler/html_ingest.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/html_ingest.py')
-rw-r--r-- python/sandcrawler/html_ingest.py | 4
1 files changed, 2 insertions, 2 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 7594365..91b9cd6 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -25,9 +25,9 @@ def html_extract_body_teixml(doc: bytes) -> dict:
include_comments=False,
include_formatting=True,
)
- except (ValueError, TypeError) as e:
+ except (ValueError, TypeError, Exception) as e:
return dict(
- status="parse-error",
+ status="trafilatura-parse-error",
error_msg=str(e)[:1000],
)
if tei_xml: