From 629290b5713f4d33688fdbda614707d82898b0af Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 2 Jan 2020 18:05:06 -0800
Subject: handle grobid2json errors in calling code instead

---
 python/sandcrawler/persist.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'python')

diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 71ada51..c24dec8 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -20,6 +20,7 @@ grobid
 """
 
 import os
+import xml.etree.ElementTree
 
 from sandcrawler.workers import SandcrawlerWorker
 from sandcrawler.db import SandcrawlerPostgresClient
@@ -230,7 +231,12 @@ class PersistGrobidWorker(SandcrawlerWorker):
             self.counts['s3-put'] += 1
 
             # enhance with teixml2json metadata, if available
-            metadata = self.grobid.metadata(r)
+            try:
+                metadata = self.grobid.metadata(r)
+            except xml.etree.ElementTree.ParseError as xml_e:
+                r['status'] = 'bad-grobid-xml'
+                r['metadata'] = {'error_msg': str(xml_e)[:1024]}
+                continue
             if not metadata:
                 continue
             for k in ('fatcat_release', 'grobid_version'):
--
cgit v1.2.3