diff options
Diffstat (limited to 'python/sandcrawler/minio.py')
-rw-r--r-- | python/sandcrawler/minio.py | 45 |
1 files changed, 32 insertions, 13 deletions
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py index c7deea1..8836515 100644 --- a/python/sandcrawler/minio.py +++ b/python/sandcrawler/minio.py @@ -1,14 +1,18 @@ - -import io -import os import hashlib +import io +from typing import Optional, Tuple, Union import minio class SandcrawlerMinioClient(object): - - def __init__(self, host_url, access_key, secret_key, default_bucket=None): + def __init__( + self, + host_url: str, + access_key: str, + secret_key: str, + default_bucket: Optional[str] = None, + ): """ host is minio connection string (host:port) access and secret key are as expected @@ -28,7 +32,7 @@ class SandcrawlerMinioClient(object): ) self.default_bucket = default_bucket - def _blob_path(self, folder, sha1hex: str, extension: str, prefix): + def _blob_path(self, folder: str, sha1hex: str, extension: str, prefix: str) -> str: if not extension: extension = "" if not prefix: @@ -44,7 +48,15 @@ class SandcrawlerMinioClient(object): ) return obj_path - def put_blob(self, folder, blob, sha1hex=None, extension="", prefix="", bucket=None): + def put_blob( + self, + folder: str, + blob: Union[str, bytes], + sha1hex: Optional[str] = None, + extension: str = "", + prefix: str = "", + bucket: Optional[str] = None, + ) -> Tuple[str, str]: """ blob should be bytes sha1hex is assumed to be sha1 of the blob itself; if not supplied it will be calculated @@ -53,7 +65,7 @@ class SandcrawlerMinioClient(object): filename is SHA1 with an optional file extension. """ if type(blob) == str: - blob = blob.encode('utf-8') + blob = blob.encode("utf-8") assert type(blob) == bytes if not sha1hex: h = hashlib.sha1() @@ -64,13 +76,13 @@ class SandcrawlerMinioClient(object): bucket = self.default_bucket assert bucket content_type = "application/octet-stream" - if extension.endswith('.xml'): + if extension.endswith(".xml"): content_type = "application/xml" - if extension.endswith('.png'): + if extension.endswith(".png"): content_type = "image/png" - elif extension.endswith('.jpg') or extension.endswith('.jpeg'): + elif extension.endswith(".jpg") or extension.endswith(".jpeg"): content_type = "image/jpeg" - elif extension.endswith('.txt'): + elif extension.endswith(".txt"): content_type = "text/plain" self.mc.put_object( bucket, @@ -81,7 +93,14 @@ class SandcrawlerMinioClient(object): ) return (bucket, obj_path) - def get_blob(self, folder, sha1hex, extension="", prefix="", bucket=None): + def get_blob( + self, + folder: str, + sha1hex: str, + extension: str = "", + prefix: str = "", + bucket: Optional[str] = None, + ) -> bytes: """ sha1hex is sha1 of the blob itself |