diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:54:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:54:37 -0700 |
commit | 05bd7cbcc62588e431c5efd533189e246b2a997e (patch) | |
tree | abcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/sandcrawler/pdftrio.py | |
parent | f3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff) | |
download | sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip |
make fmt
Diffstat (limited to 'python/sandcrawler/pdftrio.py')
-rw-r--r-- | python/sandcrawler/pdftrio.py | 10 |
1 files changed, 2 insertions, 8 deletions
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py index 7d03357..e3d4a54 100644 --- a/python/sandcrawler/pdftrio.py +++ b/python/sandcrawler/pdftrio.py @@ -1,4 +1,3 @@ - import time import requests @@ -8,7 +7,6 @@ from .workers import SandcrawlerFetchWorker, SandcrawlerWorker class PdfTrioClient(object): - def __init__(self, host_url="http://pdftrio.qa.fatcat.wiki", **kwargs): self.host_url = host_url self.http_session = requests_retry_session(retries=3, backoff_factor=3) @@ -51,9 +49,7 @@ class PdfTrioClient(object): 'error_msg': 'pdftrio request connection timout', } - info = dict( - status_code=pdftrio_response.status_code, - ) + info = dict(status_code=pdftrio_response.status_code, ) if pdftrio_response.status_code == 200: resp_json = pdftrio_response.json() assert 'ensemble_score' in resp_json @@ -72,7 +68,6 @@ class PdfTrioWorker(SandcrawlerFetchWorker): """ This class is basically copied directly from GrobidWorker """ - def __init__(self, pdftrio_client, wayback_client=None, sink=None, **kwargs): super().__init__(wayback_client=wayback_client) self.pdftrio_client = pdftrio_client @@ -103,12 +98,12 @@ class PdfTrioWorker(SandcrawlerFetchWorker): result['timing']['fetch_sec'] = fetch_sec return result + class PdfTrioBlobWorker(SandcrawlerWorker): """ This is sort of like PdfTrioWorker, except it receives blobs directly, instead of fetching blobs from some remote store. """ - def __init__(self, pdftrio_client, sink=None, mode="auto", **kwargs): super().__init__() self.pdftrio_client = pdftrio_client @@ -128,4 +123,3 @@ class PdfTrioBlobWorker(SandcrawlerWorker): total_sec=time.time() - start_process, ) return result - |