aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-23 15:58:12 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-23 15:58:12 -0700
commit 103c3aeff74d7f820f40d0a77e9d85ade8cb555c (patch)
tree 20dcfdd542a15b39cf0c92dcc1088f253a64b95b
parent c6e9aa4226aa8ed02c80e829ddb1d3fd40103017 (diff)
parent 6c92ee4c0b137c28abd03ed72190210da8a1e72b (diff)
download sandcrawler-103c3aeff74d7f820f40d0a77e9d85ade8cb555c.tar.gz
sandcrawler-103c3aeff74d7f820f40d0a77e9d85ade8cb555c.zip
Merge branch 'bnewbold-extraction-tweaks'
-rwxr-xr-x mapreduce/extraction_cdx_grobid.py 11
1 files changed, 7 insertions, 4 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index ed82a5e..040538c 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -162,16 +162,18 @@ class MRExtractCdxGrobid(MRJob):
return info, dict(status="error", reason="non-200 GROBID HTTP status",
extra=grobid_response.text)
- info['grobid0:status'] = {'status': 'success'}
+ info['grobid0:status'] = {'status': 'partial'}
info['grobid0:tei_xml'] = grobid_response.content
# Convert TEI XML to JSON
try:
- info['grobid0:tei_json'] = teixml2json(grobid_response.content, encumbered=True)
+ info['grobid0:tei_json'] = teixml2json(info['grobid0:tei_xml'], encumbered=True)
except xml.etree.ElementTree.ParseError:
- return info, dict(status="fail", reason="GROBID 200 XML parse error")
+ info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML parse error")
+ return info, info['grobid0:status']
except ValueError:
- return info, dict(status="fail", reason="GROBID 200 XML non-TEI content")
+ info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML non-TEI content")
+ return info, info['grobid0:status']
tei_metadata = info['grobid0:tei_json'].copy()
for k in ('body', 'annex'):
@@ -183,6 +185,7 @@ class MRExtractCdxGrobid(MRJob):
# TODO:
info['grobid0:quality'] = None
+ info['grobid0:status'] = {'status': 'success'}
return info, None