aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/pdftrio.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:50:17 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-10-27 18:50:17 -0700
commit 826c7538e091fac14d987a3cd654975da964e240 (patch)
tree 90345b4cabb461c624ca5a218c2fc01dce3055cd /python/sandcrawler/pdftrio.py
parent 020037d4714e7ba2ab172c7278494aed0b2148ad (diff)
download sandcrawler-826c7538e091fac14d987a3cd654975da964e240.tar.gz
sandcrawler-826c7538e091fac14d987a3cd654975da964e240.zip
make fmt (black 21.9b0)
Diffstat (limited to 'python/sandcrawler/pdftrio.py')
-rw-r--r-- python/sandcrawler/pdftrio.py 78
1 files changed, 42 insertions, 36 deletions
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index 138e65c..d765164 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -32,37 +32,37 @@ class PdfTrioClient(object):
pdftrio_response = requests.post(
self.host_url + "/classify/research-pub/" + mode,
files={
- 'pdf_content': blob,
+ "pdf_content": blob,
},
timeout=60.0,
)
except requests.Timeout:
return {
- 'status': 'error-timeout',
- 'status_code': -4, # heritrix3 "HTTP timeout" code
- 'error_msg': 'pdftrio request (HTTP POST) timeout',
+ "status": "error-timeout",
+ "status_code": -4, # heritrix3 "HTTP timeout" code
+ "error_msg": "pdftrio request (HTTP POST) timeout",
}
except requests.exceptions.ConnectionError:
# crude back-off
time.sleep(2.0)
return {
- 'status': 'error-connect',
- 'status_code': -2, # heritrix3 "HTTP connect" code
- 'error_msg': 'pdftrio request connection timout',
+ "status": "error-connect",
+ "status_code": -2, # heritrix3 "HTTP connect" code
+ "error_msg": "pdftrio request connection timout",
}
info: Dict[str, Any] = dict(status_code=pdftrio_response.status_code)
if pdftrio_response.status_code == 200:
resp_json = pdftrio_response.json()
- assert 'ensemble_score' in resp_json
- assert 'status' in resp_json
- assert 'versions' in resp_json
+ assert "ensemble_score" in resp_json
+ assert "status" in resp_json
+ assert "versions" in resp_json
info.update(resp_json)
else:
- info['status'] = 'error'
+ info["status"] = "error"
# TODO: might return JSON with some info?
- info['_total_sec'] = pdftrio_response.elapsed.total_seconds()
+ info["_total_sec"] = pdftrio_response.elapsed.total_seconds()
return info
@@ -70,11 +70,14 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
"""
This class is basically copied directly from GrobidWorker
"""
- def __init__(self,
- pdftrio_client: PdfTrioClient,
- wayback_client: Optional[WaybackClient] = None,
- sink: Optional[SandcrawlerWorker] = None,
- **kwargs):
+
+ def __init__(
+ self,
+ pdftrio_client: PdfTrioClient,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs
+ ):
super().__init__(wayback_client=wayback_client, **kwargs)
self.pdftrio_client = pdftrio_client
self.sink = sink
@@ -86,22 +89,22 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
start = time.time()
fetch_result = self.fetch_blob(record)
fetch_sec = time.time() - start
- if fetch_result['status'] != 'success':
+ if fetch_result["status"] != "success":
return fetch_result
- blob: bytes = fetch_result['blob']
+ blob: bytes = fetch_result["blob"]
assert blob and isinstance(blob, bytes)
result = dict()
- result['file_meta'] = gen_file_metadata(blob)
- result['key'] = result['file_meta']['sha1hex']
- result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob)
- result['source'] = record
- result['timing'] = dict(
- pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
+ result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob)
+ result["source"] = record
+ result["timing"] = dict(
+ pdftrio_sec=result["pdf_trio"].pop("_total_sec", None),
total_sec=time.time() - start_process,
)
if fetch_sec:
- result['timing']['fetch_sec'] = fetch_sec
+ result["timing"]["fetch_sec"] = fetch_sec
return result
@@ -110,11 +113,14 @@ class PdfTrioBlobWorker(SandcrawlerWorker):
This is sort of like PdfTrioWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self,
- pdftrio_client: PdfTrioClient,
- sink: Optional[SandcrawlerWorker] = None,
- mode: str = "auto",
- **kwargs):
+
+ def __init__(
+ self,
+ pdftrio_client: PdfTrioClient,
+ sink: Optional[SandcrawlerWorker] = None,
+ mode: str = "auto",
+ **kwargs
+ ):
super().__init__(**kwargs)
self.pdftrio_client = pdftrio_client
self.sink = sink
@@ -126,11 +132,11 @@ class PdfTrioBlobWorker(SandcrawlerWorker):
return None
assert isinstance(blob, bytes)
result = dict()
- result['file_meta'] = gen_file_metadata(blob)
- result['key'] = result['file_meta']['sha1hex']
- result['pdf_trio'] = self.pdftrio_client.classify_pdf(blob, mode=self.mode)
- result['timing'] = dict(
- pdftrio_sec=result['pdf_trio'].pop('_total_sec', None),
+ result["file_meta"] = gen_file_metadata(blob)
+ result["key"] = result["file_meta"]["sha1hex"]
+ result["pdf_trio"] = self.pdftrio_client.classify_pdf(blob, mode=self.mode)
+ result["timing"] = dict(
+ pdftrio_sec=result["pdf_trio"].pop("_total_sec", None),
total_sec=time.time() - start_process,
)
return result