Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r--  python/sandcrawler/grobid.py  105
1 file changed, 55 insertions(+), 50 deletions(-)
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 67aca17..26918f6 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -12,11 +12,11 @@ from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
class GrobidClient(object):
def __init__(self, host_url: str = "http://grobid.qa.fatcat.wiki", **kwargs):
self.host_url = host_url
- self.consolidate_mode = int(kwargs.get('consolidate_mode', 0))
+        self.consolidate_mode = int(kwargs.get("consolidate_mode", 0))

- def process_fulltext(self,
- blob: bytes,
- consolidate_mode: Optional[int] = None) -> Dict[str, Any]:
+ def process_fulltext(
+ self, blob: bytes, consolidate_mode: Optional[int] = None
+ ) -> Dict[str, Any]:
"""
Returns dict with keys:
- status_code
@@ -36,72 +36,75 @@ class GrobidClient(object):
grobid_response = requests.post(
self.host_url + "/api/processFulltextDocument",
files={
- 'input': blob,
- 'consolidateHeader': consolidate_mode,
- 'consolidateCitations': 0, # too expensive for now
- 'includeRawCitations': 1,
+ "input": blob,
+ "consolidateHeader": consolidate_mode,
+ "consolidateCitations": 0, # too expensive for now
+ "includeRawCitations": 1,
},
timeout=180.0,
)
except requests.Timeout:
return {
- 'status': 'error-timeout',
- 'status_code': -4, # heritrix3 "HTTP timeout" code
- 'error_msg': 'GROBID request (HTTP POST) timeout',
+ "status": "error-timeout",
+ "status_code": -4, # heritrix3 "HTTP timeout" code
+ "error_msg": "GROBID request (HTTP POST) timeout",
}
info: Dict[str, Any] = dict(status_code=grobid_response.status_code)
if grobid_response.status_code == 200:
- info['status'] = 'success'
- info['tei_xml'] = grobid_response.text
- if len(info['tei_xml']) > 12000000:
+ info["status"] = "success"
+ info["tei_xml"] = grobid_response.text
+ if len(info["tei_xml"]) > 12000000:
# XML is larger than Kafka message size, and much larger than
# an article in general; bail out
- info['status'] = 'error'
- info['error_msg'] = "response XML too large: {} bytes".format(
- len(info['tei_xml']))
- info.pop('tei_xml')
+ info["status"] = "error"
+ info["error_msg"] = "response XML too large: {} bytes".format(
+ len(info["tei_xml"])
+ )
+ info.pop("tei_xml")
else:
# response.text is .content decoded as utf-8
- info['status'] = 'error'
- info['error_msg'] = grobid_response.text[:10000]
+ info["status"] = "error"
+ info["error_msg"] = grobid_response.text[:10000]
return info
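
For orientation, a minimal sketch of driving this client directly, as the reformatted code above defines it. The host URL and PDF filename are placeholder assumptions, and a reachable GROBID service is presumed:

    # Sketch: run one local PDF through GROBID (URL and path are hypothetical).
    client = GrobidClient(host_url="http://localhost:8070")
    with open("paper.pdf", "rb") as f:
        blob = f.read()
    result = client.process_fulltext(blob, consolidate_mode=0)
    if result["status"] == "success":
        print("got {} bytes of TEI-XML".format(len(result["tei_xml"])))
    else:
        print(result["status_code"], result.get("error_msg"))
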
def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
- if result['status'] != 'success':
+ if result["status"] != "success":
return None
- tei_json = teixml2json(result['tei_xml'], encumbered=False)
+ tei_json = teixml2json(result["tei_xml"], encumbered=False)
meta = dict()
biblio = dict()
for k in (
- 'title',
- 'authors',
- 'journal',
- 'date',
- 'doi',
+ "title",
+ "authors",
+ "journal",
+ "date",
+ "doi",
):
if tei_json.get(k):
biblio[k] = tei_json[k]
- meta['biblio'] = biblio
- for k in ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code'):
+ meta["biblio"] = biblio
+ for k in ("grobid_version", "grobid_timestamp", "fatcat_release", "language_code"):
if tei_json.get(k):
meta[k] = tei_json[k]
return meta
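
Continuing that sketch: a successful result dict can be handed straight to metadata(), which reduces the TEI response to unencumbered bibliographic fields:

    # Sketch: extract bibliographic metadata from a successful GROBID result.
    meta = client.metadata(result)
    if meta is not None:  # None whenever result["status"] != "success"
        print(meta["biblio"].get("title"))
        print(meta.get("grobid_version"))
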
class GrobidWorker(SandcrawlerFetchWorker):
- def __init__(self,
- grobid_client: GrobidClient,
- wayback_client: Optional[WaybackClient] = None,
- sink: Optional[SandcrawlerWorker] = None,
- **kwargs):
+ def __init__(
+ self,
+ grobid_client: GrobidClient,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs
+ ):
super().__init__(wayback_client=wayback_client)
self.grobid_client = grobid_client
self.sink = sink
        self.consolidate_mode = 0

def timeout_response(self, task: Any) -> Any:
- default_key = task['sha1hex']
+ default_key = task["sha1hex"]
return dict(
status="error-timeout",
error_msg="internal GROBID worker timeout",
@@ -111,16 +114,17 @@ class GrobidWorker(SandcrawlerFetchWorker):
def process(self, record: Any, key: Optional[str] = None) -> Any:
fetch_result = self.fetch_blob(record)
- if fetch_result['status'] != 'success':
+ if fetch_result["status"] != "success":
return fetch_result
- blob: bytes = fetch_result['blob']
+ blob: bytes = fetch_result["blob"]
assert blob and isinstance(blob, bytes)
- result = self.grobid_client.process_fulltext(blob,
- consolidate_mode=self.consolidate_mode)
- result['file_meta'] = gen_file_metadata(blob)
- result['source'] = record
- result['key'] = result['file_meta']['sha1hex']
+ result = self.grobid_client.process_fulltext(
+ blob, consolidate_mode=self.consolidate_mode
+ )
+ result["file_meta"] = gen_file_metadata(blob)
+ result["source"] = record
+ result["key"] = result["file_meta"]["sha1hex"]
return result
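
A sketch of the fetch-then-process worker path. This diff does not show what record schema SandcrawlerFetchWorker.fetch_blob expects; the only field confirmed here is sha1hex (used by timeout_response), so the record below is hypothetical:

    # Sketch: one GrobidWorker pass (record schema is an assumption).
    worker = GrobidWorker(grobid_client=client, wayback_client=None, sink=None)
    record = {"sha1hex": "..."}  # hypothetical minimal record
    out = worker.process(record)
    # On success, out carries file_meta, source, and a sha1hex-derived key.
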
@@ -129,10 +133,10 @@ class GrobidBlobWorker(SandcrawlerWorker):
This is sort of like GrobidWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self,
- grobid_client: GrobidClient,
- sink: Optional[SandcrawlerWorker] = None,
- **kwargs):
+
+ def __init__(
+ self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs
+ ):
super().__init__()
self.grobid_client = grobid_client
self.sink = sink
@@ -141,8 +145,9 @@ class GrobidBlobWorker(SandcrawlerWorker):
def process(self, blob: Any, key: Optional[str] = None) -> Any:
if not blob:
return None
- result = self.grobid_client.process_fulltext(blob,
- consolidate_mode=self.consolidate_mode)
- result['file_meta'] = gen_file_metadata(blob)
- result['key'] = result['file_meta']['sha1hex']
+ result = self.grobid_client.process_fulltext(
+ blob, consolidate_mode=self.consolidate_mode
+ )
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
return result
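
And the blob-oriented variant, which skips fetching entirely when the bytes are already in hand (the filename is again a placeholder):

    # Sketch: GROBID-process raw bytes directly, no fetch step.
    blob_worker = GrobidBlobWorker(grobid_client=client)
    with open("paper.pdf", "rb") as f:
        out = blob_worker.process(f.read())
    # out["key"] is the sha1hex computed by gen_file_metadata(blob).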