aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/grobid.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r--python/sandcrawler/grobid.py44
1 files changed, 44 insertions, 0 deletions
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
new file mode 100644
index 0000000..0e37c0e
--- /dev/null
+++ b/python/sandcrawler/grobid.py
@@ -0,0 +1,44 @@
+
+import requests
+
class GrobidClient(object):
    """
    Small HTTP client for a GROBID service.

    Wraps the fulltext-processing endpoint and flattens the HTTP response
    into a plain dict (see process_fulltext).
    """

    def __init__(self, host_uri, **kwargs):
        """
        host_uri: base URI of the GROBID service. The API path
            ("/api/processFulltextDocument") is appended to it directly by
            string concatenation.
        consolidate_mode (optional keyword): default consolidation mode sent
            with every request; coerced with int(), defaults to 1.
        """
        self.host_uri = host_uri
        self.consolidate_mode = int(kwargs.get('consolidate_mode', 1))

    def process_fulltext(self, blob, consolidate_mode=None):
        """
        POSTs `blob` to the GROBID processFulltextDocument endpoint.

        blob: document content to process; must be truthy (non-empty).
        consolidate_mode: per-call override of the consolidation mode. When
            None, the instance default (self.consolidate_mode) is used.

        Returns dict with keys:
            - status_code
            - status (slug: 'success' or 'error')
            - error_msg (if status == 'error')
            - tei_xml (if status == 'success', i.e. status_code is 200)

        Raises AssertionError if blob is empty. Exceptions raised by
        requests.post() itself are not caught here.

        TODO: persist connection for performance?
        """
        assert blob

        if consolidate_mode is None:
            consolidate_mode = self.consolidate_mode

        # NOTE(review): GROBID's REST documentation names the consolidation
        # form fields 'consolidateHeader' / 'consolidateCitations'; confirm
        # that the 'consolidate_mode' field name below is honored server-side.
        grobid_response = requests.post(
            self.host_uri + "/api/processFulltextDocument",
            files={
                'input': blob,
                # Send the resolved per-call value, not the instance default,
                # so the consolidate_mode argument actually takes effect.
                'consolidate_mode': consolidate_mode,
            }
        )

        info = dict(
            status_code=grobid_response.status_code,
        )
        if grobid_response.status_code == 200:
            info['status'] = 'success'
            info['tei_xml'] = grobid_response.text
        else:
            # response.text is the body decoded to str by requests; keep only
            # the first 10000 characters so error records stay bounded.
            info['status'] = 'error'
            info['error_msg'] = grobid_response.text[:10000]
        return info
+