make fmt (black 21.9b0)

author: Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:50:17 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:50:17 -0700
commit: 826c7538e091fac14d987a3cd654975da964e240 (patch)
tree: 90345b4cabb461c624ca5a218c2fc01dce3055cd /python/sandcrawler/pdftrio.py
parent: 020037d4714e7ba2ab172c7278494aed0b2148ad (diff)
download: sandcrawler-826c7538e091fac14d987a3cd654975da964e240.tar.gz
sandcrawler-826c7538e091fac14d987a3cd654975da964e240.zip
1 files changed, 42 insertions, 36 deletions
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 138e65c..d765164 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -32,37 +32,37 @@ class PdfTrioClient(object):
             pdftrio_response = requests.post(
                 self.host_url + "/classify/research-pub/" + mode,
                 files={
-                    'pdf_content': blob,
+                    "pdf_content": blob,
                 },
                 timeout=60.0,
             )
         except requests.Timeout:
             return {
-                'status': 'error-timeout',
-                'status_code': -4,  # heritrix3 "HTTP timeout" code
-                'error_msg': 'pdftrio request (HTTP POST) timeout',
+                "status": "error-timeout",
+                "status_code": -4,  # heritrix3 "HTTP timeout" code
+                "error_msg": "pdftrio request (HTTP POST) timeout",
             }
         except requests.exceptions.ConnectionError:
             # crude back-off
             time.sleep(2.0)
             return {
-                'status': 'error-connect',
-                'status_code': -2,  # heritrix3 "HTTP connect" code
-                'error_msg': 'pdftrio request connection timout',
+                "status": "error-connect",
+                "status_code": -2,  # heritrix3 "HTTP connect" code
+                "error_msg": "pdftrio request connection timout",
             }
 
         info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code)
         if pdftrio_response.status_code == 200:
             resp_json = pdftrio_response.json()
-            assert 'ensemble_score' in resp_json
-            assert 'status' in resp_json
-            assert 'versions' in resp_json
+            assert "ensemble_score" in resp_json
+            assert "status" in resp_json
+            assert "versions" in resp_json
             info.update(resp_json)
         else:
-            info['status'] = 'error'
+            info["status"] = "error"
             # TODO: might return JSON with some info?
 
-        info['_total_sec'] = pdftrio_response.elapsed.total_seconds()
+        info["_total_sec"] = pdftrio_response.elapsed.total_seconds()
         return info
 
 
@@ -70,11 +70,14 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
     """
     This class is basically copied directly from GrobidWorker
     """
-    def __init__(self,
-                 pdftrio_client: PdfTrioClient,
-                 wayback_client: Optional[WaybackClient] = None,
-                 sink: Optional[SandcrawlerWorker] = None,
-                 **kwargs):
+
+    def __init__(
+        self,
+        pdftrio_client: PdfTrioClient,
+        wayback_client: Optional[WaybackClient] = None,
+        sink: Optional[SandcrawlerWorker] = None,
+        **kwargs
+    ):
         super().__init__(wayback_client=wayback_client, **kwargs)
         self.pdftrio_client = pdftrio_client
         self.sink = sink
@@ -86,22 +89,22 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
         start = time.time()
         fetch_result = self.fetch_blob(record)
         fetch_sec = time.time() - start
-        if fetch_result['status'] != 'success':
+        if fetch_result["status"] != "success":
             return fetch_result
-        blob: bytes = fetch_result['blob']
+        blob: bytes = fetch_result["blob"]
         assert blob and isinstance(blob, bytes)
 
         result = dict()
-        result['file_meta'] = gen_file_metadata(blob)
-        result['key'] = result['file_meta']['sha1hex']
-        result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob)
-        result['source'] = record
-        result['timing'] = dict(
-            pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
+        result["file_meta"] = gen_file_metadata(blob)
+        result["key"] = result["file_meta"]["sha1hex"]
+        result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob)
+        result["source"] = record
+        result["timing"] = dict(
+            pdftrio_sec=result["pdf_trio"].pop("_total_sec", None),
             total_sec=time.time() - start_process,
         )
         if fetch_sec:
-            result['timing']['fetch_sec'] = fetch_sec
+            result["timing"]["fetch_sec"] = fetch_sec
         return result
 
 
@@ -110,11 +113,14 @@ class PdfTrioBlobWorker(SandcrawlerWorker):
     This is sort of like PdfTrioWorker, except it receives blobs directly,
     instead of fetching blobs from some remote store.
     """
-    def __init__(self,
-                 pdftrio_client: PdfTrioClient,
-                 sink: Optional[SandcrawlerWorker] = None,
-                 mode: str = "auto",
-                 **kwargs):
+
+    def __init__(
+        self,
+        pdftrio_client: PdfTrioClient,
+        sink: Optional[SandcrawlerWorker] = None,
+        mode: str = "auto",
+        **kwargs
+    ):
         super().__init__(**kwargs)
         self.pdftrio_client = pdftrio_client
         self.sink = sink
@@ -126,11 +132,11 @@ class PdfTrioBlobWorker(SandcrawlerWorker):
             return None
         assert isinstance(blob, bytes)
         result = dict()
-        result['file_meta'] = gen_file_metadata(blob)
-        result['key'] = result['file_meta']['sha1hex']
-        result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob, mode=self.mode)
-        result['timing'] = dict(
-            pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
+        result["file_meta"] = gen_file_metadata(blob)
+        result["key"] = result["file_meta"]["sha1hex"]
+        result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob, mode=self.mode)
+        result["timing"] = dict(
+            pdftrio_sec=result["pdf_trio"].pop("_total_sec", None),
             total_sec=time.time() - start_process,
         )
         return result
author	Bryan Newbold <bnewbold@archive.org>	2021-10-27 18:50:17 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-10-27 18:50:17 -0700
commit	826c7538e091fac14d987a3cd654975da964e240 (patch)
tree	90345b4cabb461c624ca5a218c2fc01dce3055cd /python/sandcrawler/pdftrio.py
parent	020037d4714e7ba2ab172c7278494aed0b2148ad (diff)
download	sandcrawler-826c7538e091fac14d987a3cd654975da964e240.tar.gz sandcrawler-826c7538e091fac14d987a3cd654975da964e240.zip