aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2019-12-02 16:33:17 -0800
committer Bryan Newbold <bnewbold@archive.org> 2019-12-02 16:33:17 -0800
commit a49ac726f2c42fcd1bcb6b1882a2d305a1f198e9 (patch)
tree c452176cade52e7c5fe8891060fb197de3ceef51 /python
parent 6f4f375529e99cbb9c06e49805a8925ffeda269a (diff)
download sandcrawler-a49ac726f2c42fcd1bcb6b1882a2d305a1f198e9.tar.gz
sandcrawler-a49ac726f2c42fcd1bcb6b1882a2d305a1f198e9.zip
fixes for large GROBID result skip
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/grobid.py | 4
1 files changed, 2 insertions, 2 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 63ca73a..70f7b16 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -43,11 +43,11 @@ class GrobidClient(object):
if grobid_response.status_code == 200:
info['status'] = 'success'
info['tei_xml'] = grobid_response.text
- if len(info['tei_xml']) > 19500000:
+ if len(info['tei_xml']) > 12000000:
# XML is larger than Kafka message size, and much larger than
# an article in general; bail out
info['status'] = 'error'
- info['error_msg'] = "response XML too large: {} bytes".format(len(len(info['tei_xml'])))
+ info['error_msg'] = "response XML too large: {} bytes".format(len(info['tei_xml']))
info.pop('tei_xml')
else:
# response.text is .content decoded as utf-8