From a49ac726f2c42fcd1bcb6b1882a2d305a1f198e9 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 2 Dec 2019 16:33:17 -0800
Subject: fixes for large GROBID result skip

---
 python/sandcrawler/grobid.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'python')

diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 63ca73a..70f7b16 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -43,11 +43,11 @@ class GrobidClient(object):
         if grobid_response.status_code == 200:
             info['status'] = 'success'
             info['tei_xml'] = grobid_response.text
-            if len(info['tei_xml']) > 19500000:
+            if len(info['tei_xml']) > 12000000:
                 # XML is larger than Kafka message size, and much larger than
                 # an article in general; bail out
                 info['status'] = 'error'
-                info['error_msg'] = "response XML too large: {} bytes".format(len(len(info['tei_xml'])))
+                info['error_msg'] = "response XML too large: {} bytes".format(len(info['tei_xml']))
                 info.pop('tei_xml')
         else:
             # response.text is .content decoded as utf-8
--
cgit v1.2.3