diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-02 18:05:06 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-02 18:12:59 -0800 |
commit | 629290b5713f4d33688fdbda614707d82898b0af (patch) | |
tree | 2e85b112a391fbf9bdeffa196eb5f3cd34ea05ea | |
parent | 5dc1a8642077b67f3af0a41cdac851bb96a435b7 (diff) | |
download | sandcrawler-629290b5713f4d33688fdbda614707d82898b0af.tar.gz sandcrawler-629290b5713f4d33688fdbda614707d82898b0af.zip |
handle grobid2json errors in calling code instead
-rw-r--r-- | python/sandcrawler/persist.py | 8 |
1 files changed, 7 insertions, 1 deletions
```diff
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 71ada51..c24dec8 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -20,6 +20,7 @@ grobid
 """
 
 import os
+import xml.etree.ElementTree
 
 from sandcrawler.workers import SandcrawlerWorker
 from sandcrawler.db import SandcrawlerPostgresClient
@@ -230,7 +231,12 @@ class PersistGrobidWorker(SandcrawlerWorker):
                 self.counts['s3-put'] += 1
 
             # enhance with teixml2json metadata, if available
-            metadata = self.grobid.metadata(r)
+            try:
+                metadata = self.grobid.metadata(r)
+            except xml.etree.ElementTree.ParseError as xml_e:
+                r['status'] = 'bad-grobid-xml'
+                r['metadata'] = {'error_msg': str(xml_e)[:1024]}
+                continue
             if not metadata:
                 continue
             for k in ('fatcat_release', 'grobid_version'):
```