about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2021-11-12 14:55:53 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2021-11-12 14:55:53 -0800
commit: bd8b0b4d0d1fb04e34002dd3da91d83b922d0991 (patch)
tree: ee81fe49e2e287e4d763c15856b901832d13287b
parent: 807233c4625dede9399b73b58b014bc5ce3abcda (diff)
download: sandcrawler-bd8b0b4d0d1fb04e34002dd3da91d83b922d0991.tar.gz
sandcrawler-bd8b0b4d0d1fb04e34002dd3da91d83b922d0991.zip
grobid: handle XML parsing errors, and have them recorded in sandcrawler-db
-rw-r--r-- python/sandcrawler/grobid.py 6
1 file changed, 5 insertions, 1 deletion
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 791e0fe..8c575bc 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -157,7 +157,11 @@ class GrobidClient(object):
def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
if result["status"] != "success":
return None
- tei_doc = parse_document_xml(result["tei_xml"])
+ try:
+ tei_doc = parse_document_xml(result["tei_xml"])
+ except xml.etree.ElementTree.ParseError as pe:
+ result['status'] = 'bad-grobid-xml'
+ return dict(error_msg=str(pe)[:1000])
tei_doc.remove_encumbered()
tei_json = tei_doc.to_legacy_dict()
meta = dict()