aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/pdftrio.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-10-26 16:59:32 -0700
committerBryan Newbold <bnewbold@archive.org>2021-10-26 16:59:32 -0700
commit4a46f166f8514b5620d2bcb13a5c5f3e6cee66c8 (patch)
treea15635b11ca66d5cdbbc1c3f6eaa73fd5fe35801 /python/sandcrawler/pdftrio.py
parentf08bbeb7981fd692ffc9277d15d282883a408051 (diff)
downloadsandcrawler-4a46f166f8514b5620d2bcb13a5c5f3e6cee66c8.tar.gz
sandcrawler-4a46f166f8514b5620d2bcb13a5c5f3e6cee66c8.zip
more progress on type annotations and linting
Diffstat (limited to 'python/sandcrawler/pdftrio.py')
-rw-r--r--python/sandcrawler/pdftrio.py29
1 files changed, 20 insertions, 9 deletions
diff --git a/python/sandcrawler/pdftrio.py b/python/sandcrawler/pdftrio.py
index ba875cd..7b18367 100644
--- a/python/sandcrawler/pdftrio.py
+++ b/python/sandcrawler/pdftrio.py
@@ -1,17 +1,19 @@
import time
+from typing import Any, Dict, Optional
import requests
+from .ia import WaybackClient
from .misc import gen_file_metadata, requests_retry_session
from .workers import SandcrawlerFetchWorker, SandcrawlerWorker
class PdfTrioClient(object):
- def __init__(self, host_url="http://pdftrio.qa.fatcat.wiki", **kwargs):
+ def __init__(self, host_url: str = "http://pdftrio.qa.fatcat.wiki", **kwargs):
self.host_url = host_url
self.http_session = requests_retry_session(retries=3, backoff_factor=3)
- def classify_pdf(self, blob, mode="auto"):
+ def classify_pdf(self, blob: bytes, mode: str = "auto") -> Dict[str, Any]:
"""
Returns a dict with at least:
@@ -24,7 +26,7 @@ class PdfTrioClient(object):
appropriately; an optional `error_msg` may also be set. For some other
errors, like connection failure, an exception is raised.
"""
- assert blob
+ assert blob and type(blob) == bytes
try:
pdftrio_response = requests.post(
@@ -68,12 +70,16 @@ class PdfTrioWorker(SandcrawlerFetchWorker):
"""
This class is basically copied directly from GrobidWorker
"""
- def __init__(self, pdftrio_client, wayback_client=None, sink=None, **kwargs):
- super().__init__(wayback_client=wayback_client)
+ def __init__(self,
+ pdftrio_client: PdfTrioClient,
+ wayback_client: Optional[WaybackClient] = None,
+ sink: Optional[SandcrawlerWorker] = None,
+ **kwargs):
+ super().__init__(wayback_client=wayback_client, **kwargs)
self.pdftrio_client = pdftrio_client
self.sink = sink
- def process(self, record, key=None):
+ def process(self, record: Any, key: str = None) -> Any:
start_process = time.time()
fetch_sec = None
@@ -103,16 +109,21 @@ class PdfTrioBlobWorker(SandcrawlerWorker):
This is sort of like PdfTrioWorker, except it receives blobs directly,
instead of fetching blobs from some remote store.
"""
- def __init__(self, pdftrio_client, sink=None, mode="auto", **kwargs):
- super().__init__()
+ def __init__(self,
+ pdftrio_client: PdfTrioClient,
+ sink: Optional[SandcrawlerWorker] = None,
+ mode: str = "auto",
+ **kwargs):
+ super().__init__(**kwargs)
self.pdftrio_client = pdftrio_client
self.sink = sink
self.mode = mode
- def process(self, blob, key=None):
+ def process(self, blob: Any, key: str = None) -> Any:
start_process = time.time()
if not blob:
return None
+ assert isinstance(blob, bytes)
result = dict()
result['file_meta'] = gen_file_metadata(blob)
result['key'] = result['file_meta']['sha1hex']