about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2022-07-28 11:12:38 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2022-07-28 11:12:38 -0700
commit: 8918b4106aa33d936f07df41ac0bdc65825e6ef4 (patch)
tree: 1c327cc11ea0fb50ffc586233fde4cb41a37c4d7
parent: 5d744e72573b8dd698c833f8771adf61b8b35c34 (diff)
download: sandcrawler-master.tar.gz
sandcrawler-master.zip
html ingest: handle TEI-XML parse error (HEAD, master)
-rw-r--r-- python/sandcrawler/ingest_file.py 5
1 files changed, 4 insertions, 1 deletions
diff --git a/python/sandcrawler/ingest_file.py b/python/sandcrawler/ingest_file.py
index 3102ec2..6c72b96 100644
--- a/python/sandcrawler/ingest_file.py
+++ b/python/sandcrawler/ingest_file.py
@@ -457,7 +457,10 @@ class IngestFileWorker(SandcrawlerWorker):
return dict(status="html-selectolax-error")
html_biblio = html_extract_biblio(resource.terminal_url, html_doc)
assert html_biblio
- html_body = html_extract_body_teixml(resource.body)
+ try:
+ html_body = html_extract_body_teixml(resource.body)
+ except xml.etree.ElementTree.ParseError:
+ return dict(status="html-teixml-error")
html_platform = html_guess_platform(resource.terminal_url, html_doc, html_biblio)
html_scope = html_guess_scope(
resource.terminal_url, html_doc, html_biblio, html_body.get("word_count")