aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/minio.py
diff options
context:
space:
mode:
Diffstat (limited to 'python/sandcrawler/minio.py')
-rw-r--r--python/sandcrawler/minio.py45
1 files changed, 32 insertions, 13 deletions
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index c7deea1..8836515 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,14 +1,18 @@
-
-import io
-import os
import hashlib
+import io
+from typing import Optional, Tuple, Union
import minio
class SandcrawlerMinioClient(object):
-
- def __init__(self, host_url, access_key, secret_key, default_bucket=None):
+ def __init__(
+ self,
+ host_url: str,
+ access_key: str,
+ secret_key: str,
+ default_bucket: Optional[str] = None,
+ ):
"""
host is minio connection string (host:port)
access and secret key are as expected
@@ -28,7 +32,7 @@ class SandcrawlerMinioClient(object):
)
self.default_bucket = default_bucket
- def _blob_path(self, folder, sha1hex: str, extension: str, prefix):
+ def _blob_path(self, folder: str, sha1hex: str, extension: str, prefix: str) -> str:
if not extension:
extension = ""
if not prefix:
@@ -44,7 +48,15 @@ class SandcrawlerMinioClient(object):
)
return obj_path
- def put_blob(self, folder, blob, sha1hex=None, extension="", prefix="", bucket=None):
+ def put_blob(
+ self,
+ folder: str,
+ blob: Union[str, bytes],
+ sha1hex: Optional[str] = None,
+ extension: str = "",
+ prefix: str = "",
+ bucket: Optional[str] = None,
+ ) -> Tuple[str, str]:
"""
blob should be bytes
sha1hex is assumed to be sha1 of the blob itself; if not supplied it will be calculated
@@ -53,7 +65,7 @@ class SandcrawlerMinioClient(object):
filename is SHA1 with an optional file extension.
"""
if type(blob) == str:
- blob = blob.encode('utf-8')
+ blob = blob.encode("utf-8")
assert type(blob) == bytes
if not sha1hex:
h = hashlib.sha1()
@@ -64,13 +76,13 @@ class SandcrawlerMinioClient(object):
bucket = self.default_bucket
assert bucket
content_type = "application/octet-stream"
- if extension.endswith('.xml'):
+ if extension.endswith(".xml"):
content_type = "application/xml"
- if extension.endswith('.png'):
+ if extension.endswith(".png"):
content_type = "image/png"
- elif extension.endswith('.jpg') or extension.endswith('.jpeg'):
+ elif extension.endswith(".jpg") or extension.endswith(".jpeg"):
content_type = "image/jpeg"
- elif extension.endswith('.txt'):
+ elif extension.endswith(".txt"):
content_type = "text/plain"
self.mc.put_object(
bucket,
@@ -81,7 +93,14 @@ class SandcrawlerMinioClient(object):
)
return (bucket, obj_path)
- def get_blob(self, folder, sha1hex, extension="", prefix="", bucket=None):
+ def get_blob(
+ self,
+ folder: str,
+ sha1hex: str,
+ extension: str = "",
+ prefix: str = "",
+ bucket: Optional[str] = None,
+ ) -> bytes:
"""
sha1hex is sha1 of the blob itself