diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-11-12 14:55:53 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-11-12 14:55:53 -0800 |
commit | bd8b0b4d0d1fb04e34002dd3da91d83b922d0991 (patch) | |
tree | ee81fe49e2e287e4d763c15856b901832d13287b | |
parent | 807233c4625dede9399b73b58b014bc5ce3abcda (diff) | |
download | sandcrawler-bd8b0b4d0d1fb04e34002dd3da91d83b922d0991.tar.gz sandcrawler-bd8b0b4d0d1fb04e34002dd3da91d83b922d0991.zip |
grobid: handle XML parsing errors, and have them recorded in sandcrawler-db
-rw-r--r-- | python/sandcrawler/grobid.py | 6 |
1 files changed, 5 insertions, 1 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 791e0fe..8c575bc 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -157,7 +157,11 @@ class GrobidClient(object):
     def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         if result["status"] != "success":
             return None
-        tei_doc = parse_document_xml(result["tei_xml"])
+        try:
+            tei_doc = parse_document_xml(result["tei_xml"])
+        except xml.etree.ElementTree.ParseError as pe:
+            result['status'] = 'bad-grobid-xml'
+            return dict(error_msg=str(pe)[:1000])
         tei_doc.remove_encumbered()
         tei_json = tei_doc.to_legacy_dict()
         meta = dict()