blob: 0e37c0eb2d8571ff717cae32789dd18ef4d557b1 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
|
import requests
class GrobidClient(object):
    """Thin HTTP client for a GROBID service's fulltext-processing endpoint.

    Attributes:
        host_uri: base URI of the service; the endpoint path is appended to
            it verbatim, so it should not end with a trailing slash.
        consolidate_mode: default consolidation mode (int) sent with each
            request unless overridden per call.
    """

    def __init__(self, host_uri, **kwargs):
        """Create a client.

        Args:
            host_uri: base URI of the GROBID service.
            **kwargs: optional settings. Only ``consolidate_mode`` is read;
                it is coerced with ``int()`` and defaults to 1.
        """
        self.host_uri = host_uri
        self.consolidate_mode = int(kwargs.get('consolidate_mode', 1))

    def process_fulltext(self, blob, consolidate_mode=None):
        """POST a document to ``/api/processFulltextDocument``.

        Args:
            blob: non-empty document content, sent as the ``input`` part of
                a multipart form upload.
            consolidate_mode: per-call override of the consolidation mode;
                when None, the client's default is used.

        Returns dict with keys:
            - status_code (HTTP status code of the response)
            - status (slug: 'success' or 'error')
            - error_msg (if status == 'error'; at most 10000 characters)
            - tei_xml (if status == 'success', i.e. status_code is 200)

        Raises:
            AssertionError: if ``blob`` is empty or None.

        Exceptions raised by ``requests.post`` (connection errors etc.) are
        not caught here. No timeout is passed to ``requests.post``.

        TODO: persist connection for performance?
        """
        assert blob
        if consolidate_mode is None:
            consolidate_mode = self.consolidate_mode
        grobid_response = requests.post(
            self.host_uri + "/api/processFulltextDocument",
            files={
                'input': blob,
                # Send the resolved per-call value, not the instance default,
                # so that an explicit override actually takes effect.
                'consolidate_mode': consolidate_mode,
            }
        )
        info = dict(
            status_code=grobid_response.status_code,
        )
        if grobid_response.status_code == 200:
            info['status'] = 'success'
            info['tei_xml'] = grobid_response.text
        else:
            # response.text is the body decoded to text by requests; cap its
            # length so a huge error page is not carried around.
            info['status'] = 'error'
            info['error_msg'] = grobid_response.text[:10000]
        return info
|