From bd8b0b4d0d1fb04e34002dd3da91d83b922d0991 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 12 Nov 2021 14:55:53 -0800
Subject: grobid: handle XML parsing errors, and have them recorded in sandcrawler-db

---
 python/sandcrawler/grobid.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 791e0fe..8c575bc 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -157,7 +157,11 @@ class GrobidClient(object):
     def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         if result["status"] != "success":
             return None
-        tei_doc = parse_document_xml(result["tei_xml"])
+        try:
+            tei_doc = parse_document_xml(result["tei_xml"])
+        except xml.etree.ElementTree.ParseError as pe:
+            result['status'] = 'bad-grobid-xml'
+            return dict(error_msg=str(pe)[:1000])
         tei_doc.remove_encumbered()
         tei_json = tei_doc.to_legacy_dict()
         meta = dict()
-- 
cgit v1.2.3