Diffstat (limited to 'python/sandcrawler/grobid.py')
-rw-r--r--   python/sandcrawler/grobid.py   105
1 file changed, 55 insertions(+), 50 deletions(-)
diff --git a/python/sandcrawler/grobid.py b/python/sandcrawler/grobid.py
index 67aca17..26918f6 100644
--- a/python/sandcrawler/grobid.py
+++ b/python/sandcrawler/grobid.py
@@ -12,11 +12,11 @@ from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
 class GrobidClient(object):
     def __init__(self, host_url: str = "http://grobid.qa.fatcat.wiki", **kwargs):
         self.host_url = host_url
-        self.consolidate_mode = int(kwargs.get('consolidate_mode', 0))
+        self.consolidate_mode = int(kwargs.get("consolidate_mode", 0))
 
-    def process_fulltext(self,
-                         blob: bytes,
-                         consolidate_mode: Optional[int] = None) -> Dict[str, Any]:
+    def process_fulltext(
+        self, blob: bytes, consolidate_mode: Optional[int] = None
+    ) -> Dict[str, Any]:
         """
         Returns dict with keys:
             - status_code
@@ -36,72 +36,75 @@ class GrobidClient(object):
             grobid_response = requests.post(
                 self.host_url + "/api/processFulltextDocument",
                 files={
-                    'input': blob,
-                    'consolidateHeader': consolidate_mode,
-                    'consolidateCitations': 0,  # too expensive for now
-                    'includeRawCitations': 1,
+                    "input": blob,
+                    "consolidateHeader": consolidate_mode,
+                    "consolidateCitations": 0,  # too expensive for now
+                    "includeRawCitations": 1,
                 },
                 timeout=180.0,
             )
         except requests.Timeout:
             return {
-                'status': 'error-timeout',
-                'status_code': -4,  # heritrix3 "HTTP timeout" code
-                'error_msg': 'GROBID request (HTTP POST) timeout',
+                "status": "error-timeout",
+                "status_code": -4,  # heritrix3 "HTTP timeout" code
+                "error_msg": "GROBID request (HTTP POST) timeout",
             }
 
         info: Dict[str, Any] = dict(status_code=grobid_response.status_code)
         if grobid_response.status_code == 200:
-            info['status'] = 'success'
-            info['tei_xml'] = grobid_response.text
-            if len(info['tei_xml']) > 12000000:
+            info["status"] = "success"
+            info["tei_xml"] = grobid_response.text
+            if len(info["tei_xml"]) > 12000000:
                 # XML is larger than Kafka message size, and much larger than
                 # an article in general; bail out
-                info['status'] = 'error'
-                info['error_msg'] = "response XML too large: {} bytes".format(
-                    len(info['tei_xml']))
-                info.pop('tei_xml')
+                info["status"] = "error"
+                info["error_msg"] = "response XML too large: {} bytes".format(
+                    len(info["tei_xml"])
+                )
+                info.pop("tei_xml")
         else:
             # response.text is .content decoded as utf-8
-            info['status'] = 'error'
-            info['error_msg'] = grobid_response.text[:10000]
+            info["status"] = "error"
+            info["error_msg"] = grobid_response.text[:10000]
         return info
 
     def metadata(self, result: Dict[str, Any]) -> Optional[Dict[str, Any]]:
-        if result['status'] != 'success':
+        if result["status"] != "success":
             return None
-        tei_json = teixml2json(result['tei_xml'], encumbered=False)
+        tei_json = teixml2json(result["tei_xml"], encumbered=False)
         meta = dict()
         biblio = dict()
         for k in (
-                'title',
-                'authors',
-                'journal',
-                'date',
-                'doi',
+            "title",
+            "authors",
+            "journal",
+            "date",
+            "doi",
         ):
             if tei_json.get(k):
                 biblio[k] = tei_json[k]
-        meta['biblio'] = biblio
-        for k in ('grobid_version', 'grobid_timestamp', 'fatcat_release', 'language_code'):
+        meta["biblio"] = biblio
+        for k in ("grobid_version", "grobid_timestamp", "fatcat_release", "language_code"):
             if tei_json.get(k):
                 meta[k] = tei_json[k]
         return meta
 
 
 class GrobidWorker(SandcrawlerFetchWorker):
-    def __init__(self,
-                 grobid_client: GrobidClient,
-                 wayback_client: Optional[WaybackClient] = None,
-                 sink: Optional[SandcrawlerWorker] = None,
-                 **kwargs):
+    def __init__(
+        self,
+        grobid_client: GrobidClient,
+        wayback_client: Optional[WaybackClient] = None,
+        sink: Optional[SandcrawlerWorker] = None,
+        **kwargs
+    ):
         super().__init__(wayback_client=wayback_client)
         self.grobid_client = grobid_client
         self.sink = sink
         self.consolidate_mode = 0
 
     def timeout_response(self, task: Any) -> Any:
-        default_key = task['sha1hex']
+        default_key = task["sha1hex"]
         return dict(
             status="error-timeout",
             error_msg="internal GROBID worker timeout",
@@ -111,16 +114,17 @@ class GrobidWorker(SandcrawlerFetchWorker):
 
     def process(self, record: Any, key: Optional[str] = None) -> Any:
         fetch_result = self.fetch_blob(record)
-        if fetch_result['status'] != 'success':
+        if fetch_result["status"] != "success":
             return fetch_result
-        blob: bytes = fetch_result['blob']
+        blob: bytes = fetch_result["blob"]
         assert blob and isinstance(blob, bytes)
 
-        result = self.grobid_client.process_fulltext(blob,
-                                                     consolidate_mode=self.consolidate_mode)
-        result['file_meta'] = gen_file_metadata(blob)
-        result['source'] = record
-        result['key'] = result['file_meta']['sha1hex']
+        result = self.grobid_client.process_fulltext(
+            blob, consolidate_mode=self.consolidate_mode
+        )
+        result["file_meta"] = gen_file_metadata(blob)
+        result["source"] = record
+        result["key"] = result["file_meta"]["sha1hex"]
         return result
 
 
@@ -129,10 +133,10 @@ class GrobidBlobWorker(SandcrawlerWorker):
     This is sort of like GrobidWorker, except it receives blobs directly,
     instead of fetching blobs from some remote store.
     """
-    def __init__(self,
-                 grobid_client: GrobidClient,
-                 sink: Optional[SandcrawlerWorker] = None,
-                 **kwargs):
+
+    def __init__(
+        self, grobid_client: GrobidClient, sink: Optional[SandcrawlerWorker] = None, **kwargs
+    ):
         super().__init__()
         self.grobid_client = grobid_client
         self.sink = sink
@@ -141,8 +145,9 @@ class GrobidBlobWorker(SandcrawlerWorker):
     def process(self, blob: Any, key: Optional[str] = None) -> Any:
         if not blob:
             return None
-        result = self.grobid_client.process_fulltext(blob,
-                                                     consolidate_mode=self.consolidate_mode)
-        result['file_meta'] = gen_file_metadata(blob)
-        result['key'] = result['file_meta']['sha1hex']
+        result = self.grobid_client.process_fulltext(
+            blob, consolidate_mode=self.consolidate_mode
+        )
+        result["file_meta"] = gen_file_metadata(blob)
+        result["key"] = result["file_meta"]["sha1hex"]
         return result
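For reference, a minimal usage sketch of the client as it stands after this reformatting; the local GROBID URL and the PDF path are placeholders, not part of the change:

    from sandcrawler.grobid import GrobidClient

    # host_url defaults to http://grobid.qa.fatcat.wiki; point at any GROBID instance
    client = GrobidClient(host_url="http://localhost:8070", consolidate_mode=0)

    with open("paper.pdf", "rb") as f:
        blob = f.read()

    # returns a dict with "status_code", "status", and either "tei_xml" or "error_msg"
    result = client.process_fulltext(blob)
    if result["status"] == "success":
        # metadata() pulls out a dict with "biblio" plus grobid_version/timestamp fields
        meta = client.metadata(result)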