From 0c398392aa298d28694bf5bd37d3e4912de8a2f5 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 8 May 2018 03:24:01 +0000 Subject: actually fix oversize inserts --- mapreduce/extraction_cdx_grobid.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py index e87da40..7771e45 100755 --- a/mapreduce/extraction_cdx_grobid.py +++ b/mapreduce/extraction_cdx_grobid.py @@ -146,18 +146,18 @@ class MRExtractCdxGrobid(MRJob): return None, dict(status="error", reason="connection to GROBID worker") info['grobid0:status_code'] = grobid_response.status_code + + # 4 MByte XML size limit; don't record the GROBID response body on this path + if len(grobid_response.content) > 4000000: + info['grobid0:status'] = {'status': 'oversize'} + return info, dict(status="oversize", reason="TEI response was too large") + if grobid_response.status_code != 200: # response.text is .content decoded as utf-8 - info['grobid0:status'] = dict(description=grobid_response.text) + info['grobid0:status'] = dict(status='error', description=grobid_response.text) return info, dict(status="error", reason="non-200 GROBID HTTP status", extra=grobid_response.text) - # 4 MByte XML size limit - if len(grobid_response.content) > 4000000: - info['grobid0:status'] = dict(description=grobid_response.text) - return info, dict(status="oversize", reason="TEI response was too large", - extra=grobid_response.text) - info['grobid0:status'] = {'status': 'success'} info['grobid0:tei_xml'] = grobid_response.content -- cgit v1.2.3