diff options
Diffstat (limited to 'python/sandcrawler/html_ingest.py')
-rw-r--r-- | python/sandcrawler/html_ingest.py | 4 |
1 files changed, 2 insertions, 2 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 7594365..91b9cd6 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -25,9 +25,9 @@ def html_extract_body_teixml(doc: bytes) -> dict:
             include_comments=False,
             include_formatting=True,
         )
-    except (ValueError, TypeError) as e:
+    except (ValueError, TypeError, Exception) as e:
         return dict(
-            status="parse-error",
+            status="trafilatura-parse-error",
             error_msg=str(e)[:1000],
         )
     if tei_xml: