From aa49ab86e4a86067ba2346d8bccf389be940b8e2 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Sun, 1 Dec 2019 15:42:05 -0800
Subject: filter out very large GROBID XML bodies

This is to prevent Kafka MSG_SIZE_TOO_LARGE publish errors. We should
probably bump this limit in the future.

Open problems: hand-coding this size number isn't good, and it needs to
be updated in two places. We shouldn't filter for non-Kafka sinks. A
corner case may still exist where the JSON-encoded XML is larger than
the raw XML character string, due to escaping (eg, of unicode
characters).
---
 python/sandcrawler/grobid.py | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'python')

diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index d83fedc..31dc270 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -44,6 +44,12 @@ class GrobidClient(object):
         if grobid_response.status_code == 200:
             info['status'] = 'success'
             info['tei_xml'] = grobid_response.text
+            if len(info['tei_xml']) > 19500000:
+                # XML is larger than Kafka message size, and much larger than
+                # an article in general; bail out
+                info['status'] = 'error'
+                info['error_msg'] = "response XML too large: {} bytes".format(len(info['tei_xml']))
+                info.pop('tei_xml')
         else:
             # response.text is .content decoded as utf-8
             info['status'] = 'error'
--
cgit v1.2.3
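
A note on the open problems above: one way to avoid hand-coding the
threshold in two places, and to close the JSON-encoding corner case, is
to measure the serialized message rather than the raw XML string,
against a single shared constant. A minimal sketch, assuming a shared
module constant and a helper function; both names are hypothetical and
not part of this patch:

import json

# Hypothetical shared constant, sized to the patch's 19500000-byte
# threshold. Importing it in both the GROBID response check and the
# Kafka producer config keeps the two limits from drifting apart.
MAX_KAFKA_MSG_BYTES = 19500000

def fits_in_kafka_msg(info):
    # json.dumps() escapes non-ASCII characters as \uXXXX by default,
    # so the encoded message can be larger than len(info['tei_xml']);
    # measuring the serialized bytes closes that corner case.
    serialized = json.dumps(info).encode('utf-8')
    return len(serialized) <= MAX_KAFKA_MSG_BYTES

A check like this would run only on the Kafka publish path, leaving
file and stdout sinks unfiltered, which also addresses the commit
message's second open problem.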