From 139ca7e5a90d49c33e23de781b7e4ac21e868fac Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 21 Aug 2018 10:36:07 -0700 Subject: extraction: status reporting tweaks Improvements to how the extraction function in the extraction script reports status (in output, hbase, and counters) --- mapreduce/extraction_cdx_grobid.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'mapreduce/extraction_cdx_grobid.py') diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py index ed82a5e..227a026 100755 --- a/mapreduce/extraction_cdx_grobid.py +++ b/mapreduce/extraction_cdx_grobid.py @@ -162,16 +162,18 @@ class MRExtractCdxGrobid(MRJob): return info, dict(status="error", reason="non-200 GROBID HTTP status", extra=grobid_response.text) - info['grobid0:status'] = {'status': 'success'} - info['grobid0:tei_xml'] = grobid_response.content + info['grobid0:status'] = {'status': 'partial'} + info['grobid0:tei_xml'] = grobid_response.text # Convert TEI XML to JSON try: - info['grobid0:tei_json'] = teixml2json(grobid_response.content, encumbered=True) + info['grobid0:tei_json'] = teixml2json(info['grobid0:tei_xml'], encumbered=True) except xml.etree.ElementTree.ParseError: - return info, dict(status="fail", reason="GROBID 200 XML parse error") + info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML parse error") + return info, info['grobid0:status'] except ValueError: - return info, dict(status="fail", reason="GROBID 200 XML non-TEI content") + info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML non-TEI content") + return info, info['grobid0:status'] tei_metadata = info['grobid0:tei_json'].copy() for k in ('body', 'annex'): @@ -183,6 +185,7 @@ class MRExtractCdxGrobid(MRJob): # TODO: info['grobid0:quality'] = None + info['grobid0:status'] = {'status': 'success'} return info, None -- cgit v1.2.3 From 6c92ee4c0b137c28abd03ed72190210da8a1e72b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 21 Aug 2018 16:28:53 -0700 Subject: extraction: do want content, not text XML can have non-unicode characters? Who knew. --- mapreduce/extraction_cdx_grobid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mapreduce/extraction_cdx_grobid.py') diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py index 227a026..040538c 100755 --- a/mapreduce/extraction_cdx_grobid.py +++ b/mapreduce/extraction_cdx_grobid.py @@ -163,7 +163,7 @@ class MRExtractCdxGrobid(MRJob): extra=grobid_response.text) info['grobid0:status'] = {'status': 'partial'} - info['grobid0:tei_xml'] = grobid_response.text + info['grobid0:tei_xml'] = grobid_response.content # Convert TEI XML to JSON try: -- cgit v1.2.3