diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-23 15:58:12 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-23 15:58:12 -0700 |
commit | 103c3aeff74d7f820f40d0a77e9d85ade8cb555c (patch) | |
tree | 20dcfdd542a15b39cf0c92dcc1088f253a64b95b | |
parent | c6e9aa4226aa8ed02c80e829ddb1d3fd40103017 (diff) | |
parent | 6c92ee4c0b137c28abd03ed72190210da8a1e72b (diff) | |
download | sandcrawler-103c3aeff74d7f820f40d0a77e9d85ade8cb555c.tar.gz sandcrawler-103c3aeff74d7f820f40d0a77e9d85ade8cb555c.zip |
Merge branch 'bnewbold-extraction-tweaks'
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 11 |
1 files changed, 7 insertions, 4 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index ed82a5e..040538c 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -162,16 +162,18 @@ class MRExtractCdxGrobid(MRJob):
             return info, dict(status="error", reason="non-200 GROBID HTTP status", extra=grobid_response.text)
-        info['grobid0:status'] = {'status': 'success'}
+        info['grobid0:status'] = {'status': 'partial'}
         info['grobid0:tei_xml'] = grobid_response.content
         # Convert TEI XML to JSON
         try:
-            info['grobid0:tei_json'] = teixml2json(grobid_response.content, encumbered=True)
+            info['grobid0:tei_json'] = teixml2json(info['grobid0:tei_xml'], encumbered=True)
         except xml.etree.ElementTree.ParseError:
-            return info, dict(status="fail", reason="GROBID 200 XML parse error")
+            info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML parse error")
+            return info, info['grobid0:status']
         except ValueError:
-            return info, dict(status="fail", reason="GROBID 200 XML non-TEI content")
+            info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML non-TEI content")
+            return info, info['grobid0:status']
         tei_metadata = info['grobid0:tei_json'].copy()
         for k in ('body', 'annex'):
@@ -183,6 +185,7 @@ class MRExtractCdxGrobid(MRJob):
         # TODO:
         info['grobid0:quality'] = None
+        info['grobid0:status'] = {'status': 'success'}
         return info, None