about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-05-08 03:24:01 +0000
committer Bryan Newbold <bnewbold@archive.org> 2018-05-08 03:24:01 +0000
commit 0c398392aa298d28694bf5bd37d3e4912de8a2f5 (patch)
tree 1582ed0f90eefecff38dfa8d36567bb6ba6a24d9
parent ee6ce29e7987f936536a0ef128d3a96cc1df3d86 (diff)
download sandcrawler-0c398392aa298d28694bf5bd37d3e4912de8a2f5.tar.gz
sandcrawler-0c398392aa298d28694bf5bd37d3e4912de8a2f5.zip
actually fix oversize inserts
-rwxr-xr-x mapreduce/extraction_cdx_grobid.py 14
1 files changed, 7 insertions, 7 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index e87da40..7771e45 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -146,18 +146,18 @@ class MRExtractCdxGrobid(MRJob):
return None, dict(status="error", reason="connection to GROBID worker")
info['grobid0:status_code'] = grobid_response.status_code
+
+ # 4 MByte XML size limit; record only an 'oversize' status on this path, never the (too large) GROBID response body
+ if len(grobid_response.content) > 4000000:
+ info['grobid0:status'] = {'status': 'oversize'}
+ return info, dict(status="oversize", reason="TEI response was too large")
+
if grobid_response.status_code != 200:
# response.text is .content decoded as utf-8
- info['grobid0:status'] = dict(description=grobid_response.text)
+ info['grobid0:status'] = dict(status='error', description=grobid_response.text)
return info, dict(status="error", reason="non-200 GROBID HTTP status",
extra=grobid_response.text)
- # 4 MByte XML size limit
- if len(grobid_response.content) > 4000000:
- info['grobid0:status'] = dict(description=grobid_response.text)
- return info, dict(status="oversize", reason="TEI response was too large",
- extra=grobid_response.text)
-
info['grobid0:status'] = {'status': 'success'}
info['grobid0:tei_xml'] = grobid_response.content