aboutsummaryrefslogtreecommitdiffstats
path: root/mapreduce/extraction_cdx_grobid.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-04-26 18:24:42 +0000
committer Bryan Newbold <bnewbold@archive.org> 2018-04-26 18:24:42 +0000
commit ee6ce29e7987f936536a0ef128d3a96cc1df3d86 (patch)
tree 1d43f4239a359c70dbdb1841f9d7699c502920d3 /mapreduce/extraction_cdx_grobid.py
parent df23b6f45922875f0bf657aea3b8c3fb4451469d (diff)
download sandcrawler-ee6ce29e7987f936536a0ef128d3a96cc1df3d86.tar.gz
sandcrawler-ee6ce29e7987f936536a0ef128d3a96cc1df3d86.zip
XML size limit
Diffstat (limited to 'mapreduce/extraction_cdx_grobid.py')
-rwxr-xr-x mapreduce/extraction_cdx_grobid.py 6
1 files changed, 6 insertions, 0 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index 6659f61..e87da40 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -152,6 +152,12 @@ class MRExtractCdxGrobid(MRJob):
return info, dict(status="error", reason="non-200 GROBID HTTP status",
extra=grobid_response.text)
+ # 4 MByte XML size limit
+ if len(grobid_response.content) > 4000000:
+ info['grobid0:status'] = dict(description=grobid_response.text)
+ return info, dict(status="oversize", reason="TEI response was too large",
+ extra=grobid_response.text)
+
info['grobid0:status'] = {'status': 'success'}
info['grobid0:tei_xml'] = grobid_response.content