about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-08-21 10:36:07 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-08-21 10:36:09 -0700
commit 139ca7e5a90d49c33e23de781b7e4ac21e868fac (patch)
tree d974f00e60ee8cec662be03c39ca15c5d2030d47
parent 34fa226b27a8597ae1da788a41be2880b1cbf4fc (diff)
download sandcrawler-139ca7e5a90d49c33e23de781b7e4ac21e868fac.tar.gz
sandcrawler-139ca7e5a90d49c33e23de781b7e4ac21e868fac.zip
extraction: status reporting tweaks
Improve how the extraction function in the extraction script reports status (in output, HBase, and counters).
-rwxr-xr-x mapreduce/extraction_cdx_grobid.py 13
1 file changed, 8 insertions, 5 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index ed82a5e..227a026 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -162,16 +162,18 @@ class MRExtractCdxGrobid(MRJob):
return info, dict(status="error", reason="non-200 GROBID HTTP status",
extra=grobid_response.text)
- info['grobid0:status'] = {'status': 'success'}
- info['grobid0:tei_xml'] = grobid_response.content
+ info['grobid0:status'] = {'status': 'partial'}
+ info['grobid0:tei_xml'] = grobid_response.text
# Convert TEI XML to JSON
try:
- info['grobid0:tei_json'] = teixml2json(grobid_response.content, encumbered=True)
+ info['grobid0:tei_json'] = teixml2json(info['grobid0:tei_xml'], encumbered=True)
except xml.etree.ElementTree.ParseError:
- return info, dict(status="fail", reason="GROBID 200 XML parse error")
+ info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML parse error")
+ return info, info['grobid0:status']
except ValueError:
- return info, dict(status="fail", reason="GROBID 200 XML non-TEI content")
+ info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML non-TEI content")
+ return info, info['grobid0:status']
tei_metadata = info['grobid0:tei_json'].copy()
for k in ('body', 'annex'):
@@ -183,6 +185,7 @@ class MRExtractCdxGrobid(MRJob):
# TODO:
info['grobid0:quality'] = None
+ info['grobid0:status'] = {'status': 'success'}
return info, None