diff options
author | Bryan Newbold <bnewbold@archive.org> | 2022-07-28 11:12:38 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2022-07-28 11:12:38 -0700 |
commit | 8918b4106aa33d936f07df41ac0bdc65825e6ef4 (patch) | |
tree | 1c327cc11ea0fb50ffc586233fde4cb41a37c4d7 | |
parent | 5d744e72573b8dd698c833f8771adf61b8b35c34 (diff) | |
download | sandcrawler-8918b4106aa33d936f07df41ac0bdc65825e6ef4.tar.gz sandcrawler-8918b4106aa33d936f07df41ac0bdc65825e6ef4.zip |
html ingest: handle TEI-XML parse error
-rw-r--r-- | python/sandcrawler/ingest_file.py | 5 |
1 files changed, 4 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 3102ec2..6c72b96 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -457,7 +457,10 @@ class IngestFileWorker(SandcrawlerWorker):
             return dict(status="html-selectolax-error")
         html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
         assert html_biblio
-        html_body = html_extract_body_teixml(resource.body)
+        try:
+            html_body = html_extract_body_teixml(resource.body)
+        except xml.etree.ElementTree.ParseError:
+            return dict(status="html-teixml-error")
         html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio)
         html_scope = html_guess_scope(
             resource.terminal_url, html_doc, html_biblio, html_body.get("word_count")