diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2018-05-08 03:24:01 +0000 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2018-05-08 03:24:01 +0000 | 
| commit | 0c398392aa298d28694bf5bd37d3e4912de8a2f5 (patch) | |
| tree | 1582ed0f90eefecff38dfa8d36567bb6ba6a24d9 /mapreduce | |
| parent | ee6ce29e7987f936536a0ef128d3a96cc1df3d86 (diff) | |
| download | sandcrawler-0c398392aa298d28694bf5bd37d3e4912de8a2f5.tar.gz sandcrawler-0c398392aa298d28694bf5bd37d3e4912de8a2f5.zip  | |
actually fix oversize inserts
Diffstat (limited to 'mapreduce')
| -rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 14 | 
1 file changed, 7 insertions, 7 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py index e87da40..7771e45 100755 --- a/mapreduce/extraction_cdx_grobid.py +++ b/mapreduce/extraction_cdx_grobid.py @@ -146,18 +146,18 @@ class MRExtractCdxGrobid(MRJob):              return None, dict(status="error", reason="connection to GROBID worker")          info['grobid0:status_code'] = grobid_response.status_code + +        # 4 MByte XML size limit; don't record GROBID status on this path +        if len(grobid_response.content) > 4000000: +            info['grobid0:status'] = {'status': 'oversize'} +            return info, dict(status="oversize", reason="TEI response was too large") +          if grobid_response.status_code != 200:              # response.text is .content decoded as utf-8 -            info['grobid0:status'] = dict(description=grobid_response.text) +            info['grobid0:status'] = dict(status='error', description=grobid_response.text)              return info, dict(status="error", reason="non-200 GROBID HTTP status",                  extra=grobid_response.text) -        # 4 MByte XML size limit -        if len(grobid_response.content) > 4000000: -            info['grobid0:status'] = dict(description=grobid_response.text) -            return info, dict(status="oversize", reason="TEI response was too large", -                extra=grobid_response.text) -          info['grobid0:status'] = {'status': 'success'}          info['grobid0:tei_xml'] = grobid_response.content  | 
