about summary refs log tree commit diff stats
path: root/python/sandcrawler/grobid.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r-- python/sandcrawler/grobid.py 6
1 files changed, 6 insertions, 0 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index d83fedc..31dc270 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -44,6 +44,12 @@ class GrobidClient(object):
if grobid_response.status_code == 200:
info['status'] = 'success'
info['tei_xml'] = grobid_response.text
+ if len(info['tei_xml']) > 19500000:
+ # XML is larger than Kafka message size, and much larger than
+ # an article in general; bail out
+ info['status'] = 'error'
+ info['error_msg'] = "response XML too large: {} bytes".format(len(info['tei_xml']))
+ info.pop('tei_xml')
else:
# response.text is .content decoded as utf-8
info['status'] = 'error'