diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-26 18:24:42 +0000 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-26 18:24:42 +0000 |
commit | ee6ce29e7987f936536a0ef128d3a96cc1df3d86 (patch) | |
tree | 1d43f4239a359c70dbdb1841f9d7699c502920d3 | |
parent | df23b6f45922875f0bf657aea3b8c3fb4451469d (diff) | |
download | sandcrawler-ee6ce29e7987f936536a0ef128d3a96cc1df3d86.tar.gz sandcrawler-ee6ce29e7987f936536a0ef128d3a96cc1df3d86.zip |
XML size limit
-rwxr-xr-x | mapreduce/extraction_cdx_grobid.py | 6 |
1 files changed, 6 insertions, 0 deletions
```diff
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index 6659f61..e87da40 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -152,6 +152,12 @@ class MRExtractCdxGrobid(MRJob):
             return info, dict(status="error", reason="non-200 GROBID HTTP status",
                 extra=grobid_response.text)
 
+        # 4 MByte XML size limit
+        if len(grobid_response.content) > 4000000:
+            info['grobid0:status'] = dict(description=grobid_response.text)
+            return info, dict(status="oversize", reason="TEI response was too large",
+                extra=grobid_response.text)
+
         info['grobid0:status'] = {'status': 'success'}
         info['grobid0:tei_xml'] = grobid_response.content
 
```