diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 18:50:17 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-27 18:50:17 -0700 |
commit | 826c7538e091fac14d987a3cd654975da964e240 (patch) | |
tree | 90345b4cabb461c624ca5a218c2fc01dce3055cd /python/sandcrawler/pdftrio.py | |
parent | 020037d4714e7ba2ab172c7278494aed0b2148ad (diff) | |
download | sandcrawler-826c7538e091fac14d987a3cd654975da964e240.tar.gz sandcrawler-826c7538e091fac14d987a3cd654975da964e240.zip |
make fmt (black 21.9b0)
Diffstat (limited to 'python/sandcrawler/pdftrio.py')
-rw-r--r-- | python/sandcrawler/pdftrio.py | 78 |
1 files changed, 42 insertions, 36 deletions
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py index 138e65c..d765164 100644 --- a/python/sandcrawler/pdftrio.py +++ b/python/sandcrawler/pdftrio.py @@ -32,37 +32,37 @@ class PdfTrioClient(object): pdftrio_response = requests.post( self.host_url + "/classify/research-pub/" + mode, files={ - 'pdf_content': blob, + "pdf_content": blob, }, timeout=60.0, ) except requests.Timeout: return { - 'status': 'error-timeout', - 'status_code': -4, # heritrix3 "HTTP timeout" code - 'error_msg': 'pdftrio request (HTTP POST) timeout', + "status": "error-timeout", + "status_code": -4, # heritrix3 "HTTP timeout" code + "error_msg": "pdftrio request (HTTP POST) timeout", } except requests.exceptions.ConnectionError: # crude back-off time.sleep(2.0) return { - 'status': 'error-connect', - 'status_code': -2, # heritrix3 "HTTP connect" code - 'error_msg': 'pdftrio request connection timout', + "status": "error-connect", + "status_code": -2, # heritrix3 "HTTP connect" code + "error_msg": "pdftrio request connection timout", } info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code) if pdftrio_response.status_code == 200: resp_json = pdftrio_response.json() - assert 'ensemble_score' in resp_json - assert 'status' in resp_json - assert 'versions' in resp_json + assert "ensemble_score" in resp_json + assert "status" in resp_json + assert "versions" in resp_json info.update(resp_json) else: - info['status'] = 'error' + info["status"] = "error" # TODO: might return JSON with some info? - info['_total_sec'] = pdftrio_response.elapsed.total_seconds() + info["_total_sec"] = pdftrio_response.elapsed.total_seconds() return info @@ -70,11 +70,14 @@ class PdfTrioWorker(SandcrawlerFetchWorker): """ This class is basically copied directly from GrobidWorker """ - def __init__(self, - pdftrio_client: PdfTrioClient, - wayback_client: Optional[WaybackClient] = None, - sink: Optional[SandcrawlerWorker] = None, - **kwargs): + + def __init__( + self, + pdftrio_client: PdfTrioClient, + wayback_client: Optional[WaybackClient] = None, + sink: Optional[SandcrawlerWorker] = None, + **kwargs + ): super().__init__(wayback_client=wayback_client, **kwargs) self.pdftrio_client = pdftrio_client self.sink = sink @@ -86,22 +89,22 @@ class PdfTrioWorker(SandcrawlerFetchWorker): start = time.time() fetch_result = self.fetch_blob(record) fetch_sec = time.time() - start - if fetch_result['status'] != 'success': + if fetch_result["status"] != "success": return fetch_result - blob: bytes = fetch_result['blob'] + blob: bytes = fetch_result["blob"] assert blob and isinstance(blob, bytes) result = dict() - result['file_meta'] = gen_file_metadata(blob) - result['key'] = result['file_meta']['sha1hex'] - result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob) - result['source'] = record - result['timing'] = dict( - pdftrio_sec=result['pdf_trio'].pop('_total_sec', None), + result["file_meta"] = gen_file_metadata(blob) + result["key"] = result["file_meta"]["sha1hex"] + result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob) + result["source"] = record + result["timing"] = dict( + pdftrio_sec=result["pdf_trio"].pop("_total_sec", None), total_sec=time.time() - start_process, ) if fetch_sec: - result['timing']['fetch_sec'] = fetch_sec + result["timing"]["fetch_sec"] = fetch_sec return result @@ -110,11 +113,14 @@ class PdfTrioBlobWorker(SandcrawlerWorker): This is sort of like PdfTrioWorker, except it receives blobs directly, instead of fetching blobs from some remote store. """ - def __init__(self, - pdftrio_client: PdfTrioClient, - sink: Optional[SandcrawlerWorker] = None, - mode: str = "auto", - **kwargs): + + def __init__( + self, + pdftrio_client: PdfTrioClient, + sink: Optional[SandcrawlerWorker] = None, + mode: str = "auto", + **kwargs + ): super().__init__(**kwargs) self.pdftrio_client = pdftrio_client self.sink = sink @@ -126,11 +132,11 @@ class PdfTrioBlobWorker(SandcrawlerWorker): return None assert isinstance(blob, bytes) result = dict() - result['file_meta'] = gen_file_metadata(blob) - result['key'] = result['file_meta']['sha1hex'] - result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob, mode=self.mode) - result['timing'] = dict( - pdftrio_sec=result['pdf_trio'].pop('_total_sec', None), + result["file_meta"] = gen_file_metadata(blob) + result["key"] = result["file_meta"]["sha1hex"] + result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob, mode=self.mode) + result["timing"] = dict( + pdftrio_sec=result["pdf_trio"].pop("_total_sec", None), total_sec=time.time() - start_process, ) return result |