# python/sandcrawler/grobid.py
#
# Recovered from a flattened git patch: commit
# b438f52dbb7578c9a5c2153bc4ba50e33fdae7c3 by Bryan Newbold
# (Mon, 23 Sep 2019 22:58:55 -0700), "start refactoring sandcrawler python
# common code", which creates this file.

import requests


class GrobidClient(object):
    """Minimal HTTP client for a GROBID service's fulltext endpoint.

    Attributes:
        host_uri: base URI of the GROBID service; the API path is appended
            to it verbatim, so it should not end with a slash.
        consolidate_mode: default consolidation mode (int) sent with each
            request unless overridden per call.
    """

    def __init__(self, host_uri, **kwargs):
        """Create a client.

        Args:
            host_uri: base URI of the GROBID service.
            **kwargs: optional settings. Only 'consolidate_mode' is read;
                it is coerced with int() and defaults to 1.
        """
        self.host_uri = host_uri
        self.consolidate_mode = int(kwargs.get('consolidate_mode', 1))

    def _effective_consolidate_mode(self, consolidate_mode):
        """Return the per-call override, or the client default if None."""
        # Compare against None explicitly: 0 is a legitimate override and
        # must not fall back to the default.
        if consolidate_mode is None:
            return self.consolidate_mode
        return consolidate_mode

    def process_fulltext(self, blob, consolidate_mode=None):
        """POST a document to GROBID's processFulltextDocument endpoint.

        Args:
            blob: document content to upload; must be non-empty.
            consolidate_mode: optional per-call override of the client's
                default consolidate_mode.

        Returns dict with keys:
            - status_code
            - status (slug): 'success' or 'error'
            - error_msg (if status == 'error'): response body, truncated
              to 10000 characters
            - tei_xml (if status_code == 200)

        Raises:
            AssertionError: if blob is empty or None.

        TODO: persist connection for performance?
        """
        assert blob

        # Previously the resolved value was computed and then discarded in
        # favour of self.consolidate_mode, so per-call overrides never
        # reached the server.
        consolidate_mode = self._effective_consolidate_mode(consolidate_mode)

        # NOTE(review): GROBID's documented form fields appear to be
        # 'consolidateHeader' / 'consolidateCitations' -- confirm that the
        # server honours a field named 'consolidate_mode'.
        grobid_response = requests.post(
            self.host_uri + "/api/processFulltextDocument",
            files={
                'input': blob,
                'consolidate_mode': consolidate_mode,
            }
        )

        info = dict(
            status_code=grobid_response.status_code,
        )
        if grobid_response.status_code == 200:
            info['status'] = 'success'
            info['tei_xml'] = grobid_response.text
        else:
            # response.text is the response body decoded to str by
            # requests; cap it so a huge error page is not carried around.
            info['status'] = 'error'
            info['error_msg'] = grobid_response.text[:10000]
        return info