aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/minio.py
blob: 8b022113c2f4dfa7eca2bba90d8af50e7db1bf3e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99

import io
import os
import hashlib

import minio


class SandcrawlerMinioClient(object):
    """
    Small convenience wrapper around a ``minio.Minio`` client.

    Blobs are stored under a path derived from their SHA-1 hex digest (see
    ``_blob_path``), optionally inside a default bucket supplied once at
    construction time.
    """

    # Content type used when the extension matches none of the known suffixes.
    _DEFAULT_CONTENT_TYPE = "application/octet-stream"

    # (suffixes, content type) pairs consulted by _content_type_for(). The
    # suffixes are mutually exclusive, so lookup order does not matter.
    _CONTENT_TYPES = (
        ((".xml",), "application/xml"),
        ((".png",), "image/png"),
        ((".jpg", ".jpeg"), "image/jpeg"),
        ((".txt",), "text/plain"),
    )

    def __init__(self, host_url, access_key, secret_key, default_bucket=None):
        """
        host is minio connection string (host:port)
        access and secret key are as expected
        default_bucket can be supplied so that it doesn't need to be repeated for each function call

        The underlying client is created with ``secure=False``.

        Example config:

            host="localhost:9000",
            access_key=os.environ['MINIO_ACCESS_KEY'],
            secret_key=os.environ['MINIO_SECRET_KEY'],
        """
        self.mc = minio.Minio(
            host_url,
            access_key=access_key,
            secret_key=secret_key,
            secure=False,
        )
        self.default_bucket = default_bucket

    def _content_type_for(self, extension) -> str:
        """
        Map a file extension (eg, ".xml", ".thumb.png") to a content type.

        A falsy extension (None or "") and any unrecognized suffix map to
        "application/octet-stream".
        """
        if not extension:
            return self._DEFAULT_CONTENT_TYPE
        for suffixes, content_type in self._CONTENT_TYPES:
            # str.endswith accepts a tuple of alternatives
            if extension.endswith(suffixes):
                return content_type
        return self._DEFAULT_CONTENT_TYPE

    def _blob_path(self, folder, sha1hex: str, extension: str, prefix) -> str:
        """
        Build the object path for a blob:

            {prefix}{folder}/{sha1hex[0:2]}/{sha1hex[2:4]}/{sha1hex}{extension}

        Falsy extension or prefix values (eg, None) are treated as empty
        strings. No separator is inserted after prefix or before extension;
        callers include those themselves (eg, prefix="dev/", extension=".xml").

        Raises AssertionError if sha1hex is not exactly 40 characters long.
        """
        if not extension:
            extension = ""
        if not prefix:
            prefix = ""
        assert len(sha1hex) == 40
        obj_path = "{}{}/{}/{}/{}{}".format(
            prefix,
            folder,
            sha1hex[0:2],
            sha1hex[2:4],
            sha1hex,
            extension,
        )
        return obj_path

    def put_blob(self, folder, blob, sha1hex=None, extension="", prefix="", bucket=None):
        """
        blob should be bytes (a str is accepted and encoded as UTF-8)
        sha1hex is assumed to be sha1 of the blob itself; if not supplied it will be calculated
        Uploads blob to path in the given bucket. Files are stored in a top-level
        folder, then in two levels of sub-directory based on sha1, then the
        filename is SHA1 with an optional file extension.

        The content type is chosen from the extension suffix (.xml, .png,
        .jpg/.jpeg, .txt), defaulting to "application/octet-stream".

        If bucket is not supplied, the default_bucket given at construction
        is used; an AssertionError is raised if neither is set.

        Returns a (bucket, obj_path) tuple.
        """
        if isinstance(blob, str):
            blob = blob.encode('utf-8')
        assert isinstance(blob, bytes)
        if not sha1hex:
            sha1hex = hashlib.sha1(blob).hexdigest()
        # _blob_path() tolerates extension=None; normalize here as well so the
        # content type lookup does not fail on None.
        if not extension:
            extension = ""
        obj_path = self._blob_path(folder, sha1hex, extension, prefix)
        if not bucket:
            bucket = self.default_bucket
        assert bucket
        content_type = self._content_type_for(extension)
        self.mc.put_object(
            bucket,
            obj_path,
            io.BytesIO(blob),
            len(blob),
            content_type=content_type,
        )
        return (bucket, obj_path)

    def get_blob(self, folder, sha1hex, extension="", prefix="", bucket=None):
        """
        sha1hex is sha1 of the blob itself

        Fetches blob from the given bucket/folder, using the sandcrawler SHA1 path convention

        If bucket is not supplied, the default_bucket given at construction
        is used; an AssertionError is raised if neither is set.

        Returns whatever the minio client's get_object() returns, unmodified.
        NOTE(review): that is presumably a response object which the caller
        must read and release, not raw bytes; confirm against the minio
        client documentation.
        """
        obj_path = self._blob_path(folder, sha1hex, extension, prefix)
        if not bucket:
            bucket = self.default_bucket
        assert bucket
        blob = self.mc.get_object(
            bucket,
            obj_path,
        )
        # TODO: optionally verify SHA-1?
        return blob