about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rwxr-xr-x mapreduce/extraction_cdx_grobid.py 13
1 files changed, 8 insertions, 5 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index ed82a5e..227a026 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -162,16 +162,18 @@ class MRExtractCdxGrobid(MRJob):
return info, dict(status="error", reason="non-200 GROBID HTTP status",
extra=grobid_response.text)
- info['grobid0:status'] = {'status': 'success'}
- info['grobid0:tei_xml'] = grobid_response.content
+ info['grobid0:status'] = {'status': 'partial'}
+ info['grobid0:tei_xml'] = grobid_response.text
# Convert TEI XML to JSON
try:
- info['grobid0:tei_json'] = teixml2json(grobid_response.content, encumbered=True)
+ info['grobid0:tei_json'] = teixml2json(info['grobid0:tei_xml'], encumbered=True)
except xml.etree.ElementTree.ParseError:
- return info, dict(status="fail", reason="GROBID 200 XML parse error")
+ info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML parse error")
+ return info, info['grobid0:status']
except ValueError:
- return info, dict(status="fail", reason="GROBID 200 XML non-TEI content")
+ info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML non-TEI content")
+ return info, info['grobid0:status']
tei_metadata = info['grobid0:tei_json'].copy()
for k in ('body', 'annex'):
@@ -183,6 +185,7 @@ class MRExtractCdxGrobid(MRJob):
# TODO:
info['grobid0:quality'] = None
+ info['grobid0:status'] = {'status': 'success'}
return info, None