Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r--  python/sandcrawler/grobid.py  130
1 file changed, 130 insertions(+), 0 deletions(-)
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
new file mode 100644
index 0000000..b4215dc
--- /dev/null
+++ b/python/sandcrawler/grobid.py
@@ -0,0 +1,130 @@
+
+import requests
+
+from grobid2json import teixml2json
+from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
+from .misc import gen_file_metadata
+
+class GrobidClient(object):
+
+    def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs):
+        self.host_url = host_url
+        self.consolidate_mode = int(kwargs.get('consolidate_mode', 0))
+
+    def process_fulltext(self, blob, consolidate_mode=None):
+        """
+        Returns dict with keys:
+        - status_code
+        - status (slug)
+        - error_msg (if status == 'error')
+        - tei_xml (if status is 'success')
+
+        TODO: persist connection for performance?
+        """
+        assert blob
+
+        if consolidate_mode is None:
+            consolidate_mode = self.consolidate_mode
+
+        try:
+            grobid_response = requests.post(
+                self.host_url + "/api/processFulltextDocument",
+                files={
+                    'input': blob,
+                    'consolidateHeader': consolidate_mode,
+                    'consolidateCitations': 0,  # too expensive for now
+                    'includeRawCitations': 1,
+                },
+                timeout=180.0,
+            )
+        except requests.Timeout:
+            return {
+                'status': 'error-timeout',
+                'status_code': -4,  # heritrix3 "HTTP timeout" code
+                'error_msg': 'GROBID request (HTTP POST) timeout',
+            }
+
+        info = dict(
+            status_code=grobid_response.status_code,
+        )
+        if grobid_response.status_code == 200:
+            info['status'] = 'success'
+            info['tei_xml'] = grobid_response.text
+            if len(info['tei_xml']) > 12000000:
+                # XML is larger than Kafka message size, and much larger than
+                # an article in general; bail out
+                info['status'] = 'error'
+                info['error_msg'] = "response XML too large: {} bytes".format(len(info['tei_xml']))
+                info.pop('tei_xml')
+        else:
+            # response.text is .content decoded as utf-8
+            info['status'] = 'error'
+            info['error_msg'] = grobid_response.text[:10000]
+        return info
+
+    def metadata(self, result):
+        if result['status'] != 'success':
+            return None
+        tei_json = teixml2json(result['tei_xml'], encumbered=False)
+        meta = dict()
+        biblio = dict()
+        for k in ('title', 'authors', 'journal', 'date', 'doi'):
+            if tei_json.get(k):
+                biblio[k] = tei_json[k]
+        meta['biblio'] = biblio
+        for k in ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code'):
+            if tei_json.get(k):
+                meta[k] = tei_json[k]
+        return meta
+
+class GrobidWorker(SandcrawlerFetchWorker):
+
+    def __init__(self, grobid_client, wayback_client=None, sink=None, **kwargs):
+        super().__init__(wayback_client=wayback_client)
+        self.grobid_client = grobid_client
+        self.sink = sink
+        self.consolidate_mode = 0
+
+    def timeout_response(self, task):
+        default_key = task['sha1hex']
+        return dict(
+            status="error-timeout",
+            error_msg="internal GROBID worker timeout",
+            source=task,
+            key=default_key,
+        )
+
+    def process(self, record, key=None):
+        default_key = record['sha1hex']
+
+        fetch_result = self.fetch_blob(record)
+        if fetch_result['status'] != 'success':
+            return fetch_result
+        blob = fetch_result['blob']
+
+        result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
+        result['file_meta'] = gen_file_metadata(blob)
+        result['source'] = record
+        result['key'] = result['file_meta']['sha1hex']
+        return result
+
+class GrobidBlobWorker(SandcrawlerWorker):
+    """
+    This is sort of like GrobidWorker, except it receives blobs directly
+    instead of fetching them from some remote store.
+    """
+
+    def __init__(self, grobid_client, sink=None, **kwargs):
+        super().__init__()
+        self.grobid_client = grobid_client
+        self.sink = sink
+        self.consolidate_mode = 0
+
+    def process(self, blob, key=None):
+        if not blob:
+            return None
+        result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
+        result['file_meta'] = gen_file_metadata(blob)
+        result['key'] = result['file_meta']['sha1hex']
+        return result
+
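For reference, a minimal usage sketch of the client and blob worker added above; it is not part of the commit. The GROBID base URL and PDF path below are hypothetical placeholders, and a reachable GROBID instance is assumed:

    # Sketch only: "http://localhost:8070" and "paper.pdf" are placeholder
    # values, not taken from this commit.
    from sandcrawler.grobid import GrobidClient, GrobidBlobWorker

    client = GrobidClient(host_url="http://localhost:8070")

    with open("paper.pdf", "rb") as f:
        blob = f.read()

    # process_fulltext() returns the status dict described in its docstring
    result = client.process_fulltext(blob)
    if result['status'] == 'success':
        # metadata() re-parses the TEI-XML into a small biblio dict
        meta = client.metadata(result)
        print(meta['biblio'].get('title'))
    else:
        print(result['status'], result.get('error_msg'))

    # the same call wrapped in the blob worker; with no sink configured,
    # process() simply returns the result dict (key is the blob's sha1hex)
    worker = GrobidBlobWorker(client)
    print(worker.process(blob)['key'])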