aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/minio.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2019-12-26 21:16:59 -0800
committerBryan Newbold <bnewbold@archive.org>2020-01-02 18:12:58 -0800
commit293d4b176855d400324559c814abd2e404cdf31e (patch)
treee88b72109cc209fe422264deabdd1e7ca937add7 /python/sandcrawler/minio.py
parent9fda5323046cb3f87f0c7c7e07eca283ca52ce99 (diff)
downloadsandcrawler-293d4b176855d400324559c814abd2e404cdf31e.tar.gz
sandcrawler-293d4b176855d400324559c814abd2e404cdf31e.zip
flesh out minio helper, add to grobid persist
Diffstat (limited to 'python/sandcrawler/minio.py')
-rw-r--r--python/sandcrawler/minio.py55
1 files changed, 42 insertions, 13 deletions
diff --git a/python/sandcrawler/minio.py b/python/sandcrawler/minio.py
index e6ebe41..39903e0 100644
--- a/python/sandcrawler/minio.py
+++ b/python/sandcrawler/minio.py
@@ -1,11 +1,13 @@
+import io
import os
+
import minio
class SandcrawlerMinioClient(object):
- def __init__(self, host, access_key, secret_key, default_bucket=None):
+ def __init__(self, host_url, access_key, secret_key, default_bucket=None):
"""
host is minio connection string (host:port)
access and secret key are as expected
@@ -18,14 +20,30 @@ class SandcrawlerMinioClient(object):
secret_key=os.environ['MINIO_SECRET_KEY'],
"""
self.mc = minio.Minio(
- host,
+ host_url,
access_key=access_key,
secret_key=secret_key,
secure=False,
)
self.default_bucket = default_bucket
- def upload_blob(self, folder, blob, sha1hex=None, extension="", prefix="", bucket=None):
+ def _blob_path(self, folder, sha1hex, extension, prefix):
+ if not extension:
+ extension = ""
+ if not prefix:
+ prefix = ""
+ assert len(sha1hex) == 40
+ obj_path = "{}{}/{}/{}/{}{}".format(
+ prefix,
+ folder,
+ sha1hex[0:2],
+ sha1hex[2:4],
+ sha1hex,
+ extension,
+ )
+ return obj_path
+
+ def put_blob(self, folder, blob, sha1hex=None, extension="", prefix="", bucket=None):
"""
blob should be bytes
sha1hex is assumed to be sha1 of the blob itself; if not supplied it will be calculated
@@ -40,20 +58,31 @@ class SandcrawlerMinioClient(object):
h = hashlib.sha1()
h.update(blob)
sha1hex = h.hexdigest()
- obj_path = "{}{}/{}/{}/{}{}".format(
- prefix,
- folder,
- sha1hex[0:2],
- sha1hex[2:4],
- sha1hex,
- extension,
- )
+ obj_path = self._blob_path(folder, sha1hex, extension, prefix)
if not bucket:
bucket = self.default_bucket
+ assert bucket
self.mc.put_object(
- self.default_bucket,
+ bucket,
obj_path,
- blob,
+ io.BytesIO(blob),
len(blob),
)
return (bucket, obj_path)
+
+ def get_blob(self, folder, sha1hex, extension="", prefix="", bucket=None):
+ """
+ sha1hex is sha1 of the blob itself
+
+ Fetches the blob from the given bucket/folder, using the sandcrawler SHA1 path convention
+ """
+ obj_path = self._blob_path(folder, sha1hex, extension, prefix)
+ if not bucket:
+ bucket = self.default_bucket
+ assert bucket
+ blob = self.mc.get_object(
+ bucket,
+ obj_path,
+ )
+ # TODO: optionally verify SHA-1?
+ return blob