diff options
Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r-- | python/sandcrawler/grobid.py | 130 |
1 files changed, 130 insertions, 0 deletions
import requests

from grobid2json import teixml2json
from .workers import SandcrawlerWorker, SandcrawlerFetchWorker
from .misc import gen_file_metadata


class GrobidClient(object):
    """
    Thin HTTP client for a GROBID fulltext-extraction service.

    Wraps the /api/processFulltextDocument endpoint and converts the TEI-XML
    response into the dict shape used by the rest of sandcrawler.
    """

    def __init__(self, host_url="http://grobid.qa.fatcat.wiki", **kwargs):
        self.host_url = host_url
        # Default citation/header consolidation mode for GROBID requests;
        # may be overridden per-call in process_fulltext().
        self.consolidate_mode = int(kwargs.get('consolidate_mode', 0))

    def process_fulltext(self, blob, consolidate_mode=None):
        """
        Run GROBID fulltext extraction over a raw PDF blob.

        Returns dict with keys:
        - status_code
        - status (slug)
        - error_msg (if status == 'error')
        - tei_xml (if status is 200)

        TODO: persist connection for performance?
        """
        assert blob

        if consolidate_mode is None:
            consolidate_mode = self.consolidate_mode

        try:
            grobid_response = requests.post(
                self.host_url + "/api/processFulltextDocument",
                files={
                    'input': blob,
                    # BUGFIX: previously sent self.consolidate_mode, which
                    # silently ignored the per-call consolidate_mode argument
                    'consolidateHeader': consolidate_mode,
                    'consolidateCitations': 0, # too expensive for now
                    'includeRawCitations': 1,
                },
                timeout=180.0,
            )
        except requests.Timeout:
            return {
                'status': 'error-timeout',
                'status_code': -4, # heritrix3 "HTTP timeout" code
                'error_msg': 'GROBID request (HTTP POST) timeout',
            }

        info = dict(
            status_code=grobid_response.status_code,
        )
        if grobid_response.status_code == 200:
            info['status'] = 'success'
            info['tei_xml'] = grobid_response.text
            if len(info['tei_xml']) > 12000000:
                # XML is larger than Kafka message size, and much larger than
                # an article in general; bail out
                info['status'] = 'error'
                info['error_msg'] = "response XML too large: {} bytes".format(len(info['tei_xml']))
                info.pop('tei_xml')
        else:
            # response.text is .content decoded as utf-8
            info['status'] = 'error'
            info['error_msg'] = grobid_response.text[:10000]
        return info

    def metadata(self, result):
        """
        Extract a compact metadata dict from a successful process_fulltext()
        result. Returns None if the result was not a success.
        """
        if result['status'] != 'success':
            return None
        tei_json = teixml2json(result['tei_xml'], encumbered=False)
        meta = dict()
        biblio = dict()
        # bibliographic fields nested under 'biblio'
        for k in ('title', 'authors', 'journal', 'date', 'doi', ):
            if tei_json.get(k):
                biblio[k] = tei_json[k]
        meta['biblio'] = biblio
        # processing/provenance fields kept at the top level
        for k in ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code'):
            if tei_json.get(k):
                meta[k] = tei_json[k]
        return meta


class GrobidWorker(SandcrawlerFetchWorker):
    """
    Worker that fetches a blob (e.g. from wayback) and runs it through
    GROBID, emitting results to an optional sink.
    """

    def __init__(self, grobid_client, wayback_client=None, sink=None, **kwargs):
        super().__init__(wayback_client=wayback_client)
        self.grobid_client = grobid_client
        self.sink = sink
        self.consolidate_mode = 0

    def timeout_response(self, task):
        # Shape must match a normal process() result so downstream consumers
        # can handle worker-level timeouts uniformly.
        default_key = task['sha1hex']
        return dict(
            status="error-timeout",
            error_msg="internal GROBID worker timeout",
            source=task,
            key=default_key,
        )

    def process(self, record, key=None):
        default_key = record['sha1hex']

        fetch_result = self.fetch_blob(record)
        if fetch_result['status'] != 'success':
            # propagate fetch failures as-is
            return fetch_result
        blob = fetch_result['blob']

        result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
        result['file_meta'] = gen_file_metadata(blob)
        result['source'] = record
        # key on the hash of the actual fetched bytes, not the request record
        result['key'] = result['file_meta']['sha1hex']
        return result


class GrobidBlobWorker(SandcrawlerWorker):
    """
    This is sort of like GrobidWorker, except it receives blobs directly,
    instead of fetching blobs from some remote store.
    """

    def __init__(self, grobid_client, sink=None, **kwargs):
        super().__init__()
        self.grobid_client = grobid_client
        self.sink = sink
        self.consolidate_mode = 0

    def process(self, blob, key=None):
        # empty/None blobs are skipped silently (best-effort semantics)
        if not blob:
            return None
        result = self.grobid_client.process_fulltext(blob, consolidate_mode=self.consolidate_mode)
        result['file_meta'] = gen_file_metadata(blob)
        result['key'] = result['file_meta']['sha1hex']
        return result