1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
import json
import minio
import requests
from typing import Dict, Optional, Any
class SandcrawlerPostgrestClient:
    """Small HTTP client that queries a REST API rooted at ``api_url``.

    The class name suggests the API is served by PostgREST; the ``eq.``
    prefix used below is presumably its equality filter operator (confirm
    against the deployed service).
    """

    def __init__(self, api_url: str):
        """Store the base URL; endpoint paths are appended to it verbatim."""
        self.api_url = api_url

    def get_grobid(self, sha1: str) -> Optional[Dict[str, Any]]:
        """Fetch the ``/grobid`` record whose ``sha1hex`` matches ``sha1``.

        Returns the first element of the decoded JSON response, or ``None``
        when the decoded response is empty. HTTP error statuses are raised
        via ``raise_for_status()``.
        """
        endpoint = self.api_url + "/grobid"
        query = {"sha1hex": "eq." + sha1}
        response = requests.get(endpoint, params=query)
        response.raise_for_status()
        rows = response.json()
        return rows[0] if rows else None
class SandcrawlerMinioClient(object):
    """Wrapper around ``minio.Minio`` that reads blobs stored under
    SHA-1-derived object paths.
    """

    def __init__(
        self,
        host_url: str,
        access_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        default_bucket: Optional[str] = "sandcrawler",
    ):
        """
        host_url is the minio connection string (host:port)
        access_key and secret_key are passed through to the minio client
        default_bucket can be supplied so that it doesn't need to be
        repeated for each function call

        The client is always created with ``secure=False``.

        Example config:

            host_url="localhost:9000",
            access_key=os.environ['MINIO_ACCESS_KEY'],
            secret_key=os.environ['MINIO_SECRET_KEY'],
        """
        self.mc = minio.Minio(
            host_url,
            access_key=access_key,
            secret_key=secret_key,
            secure=False,
        )
        self.default_bucket = default_bucket

    def _blob_path(
        self,
        folder: str,
        sha1hex: str,
        extension: Optional[str],
        prefix: Optional[str],
    ) -> str:
        """
        Build the object path for a blob:

            {prefix}{folder}/{sha1hex[0:2]}/{sha1hex[2:4]}/{sha1hex}{extension}

        A falsy extension or prefix (None, "") is treated as the empty
        string. No separator is inserted after prefix or before extension;
        callers supply any "/" or "." themselves.

        Raises AssertionError if sha1hex is not exactly 40 characters long.
        """
        if not extension:
            extension = ""
        if not prefix:
            prefix = ""
        # Kept as an assert (not ValueError) so the exception type seen by
        # existing callers does not change.
        assert len(sha1hex) == 40, "sha1hex must be 40 characters long"
        obj_path = "{}{}/{}/{}/{}{}".format(
            prefix,
            folder,
            sha1hex[0:2],
            sha1hex[2:4],
            sha1hex,
            extension,
        )
        return obj_path

    def get_blob(self, folder, sha1hex, extension="", prefix="", bucket=None):
        """
        sha1hex is sha1 of the blob itself

        Fetches blob from the given bucket/folder, using the sandcrawler
        SHA1 path convention. When bucket is not given, default_bucket is
        used.

        Returns the ``data`` attribute of the response object returned by
        the minio client's ``get_object``.

        Raises AssertionError if sha1hex is not 40 characters long or if no
        bucket is available; errors from the minio client propagate.
        """
        obj_path = self._blob_path(folder, sha1hex, extension, prefix)
        if not bucket:
            bucket = self.default_bucket
        assert bucket, "no bucket given and no default_bucket configured"
        response = self.mc.get_object(
            bucket,
            obj_path,
        )
        # TODO: optionally verify SHA-1?
        try:
            return response.data
        finally:
            # The minio SDK requires the response to be closed and its
            # connection released back to the pool after use.
            response.close()
            response.release_conn()
|