diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-12-02 16:33:17 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-12-02 16:33:17 -0800 |
commit | a49ac726f2c42fcd1bcb6b1882a2d305a1f198e9 (patch) | |
tree | c452176cade52e7c5fe8891060fb197de3ceef51 /python | |
parent | 6f4f375529e99cbb9c06e49805a8925ffeda269a (diff) | |
download | sandcrawler-a49ac726f2c42fcd1bcb6b1882a2d305a1f198e9.tar.gz sandcrawler-a49ac726f2c42fcd1bcb6b1882a2d305a1f198e9.zip |
fixes for large GROBID result skip
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/grobid.py | 4 |
1 files changed, 2 insertions, 2 deletions
```diff
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 63ca73a..70f7b16 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -43,11 +43,11 @@ class GrobidClient(object):
         if grobid_response.status_code == 200:
             info['status'] = 'success'
             info['tei_xml'] = grobid_response.text
-            if len(info['tei_xml']) > 19500000:
+            if len(info['tei_xml']) > 12000000:
                 # XML is larger than Kafka message size, and much larger than
                 # an article in general; bail out
                 info['status'] = 'error'
-                info['error_msg'] = "response XML too large: {} bytes".format(len(len(info['tei_xml'])))
+                info['error_msg'] = "response XML too large: {} bytes".format(len(info['tei_xml']))
                 info.pop('tei_xml')
         else:
             # response.text is .content decoded as utf-8
```