diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-19 10:29:43 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-19 10:29:43 -0700 |
commit | c5faf30df8772af607689a3f65241f34b365325b (patch) | |
tree | 35bcc82bf7b08bce0317481949017123721cb804 /fatcat_scholar/sandcrawler.py | |
parent | f435f4aa9e3f490f9b7baa0d7a00fa5a986bb31e (diff) | |
download | fatcat-scholar-c5faf30df8772af607689a3f65241f34b365325b.tar.gz fatcat-scholar-c5faf30df8772af607689a3f65241f34b365325b.zip |
WIP on release-to-sim fetching
Diffstat (limited to 'fatcat_scholar/sandcrawler.py')
-rw-r--r-- | fatcat_scholar/sandcrawler.py | 75 |
1 files changed, 75 insertions, 0 deletions
"""Clients for sandcrawler data: PostgREST row lookups and minio blob fetches."""

import json
from typing import Any, Dict, Optional

import minio
import requests


class SandcrawlerPostgrestClient:
    """Thin HTTP client for a sandcrawler PostgREST API."""

    def __init__(self, api_url: str):
        """
        api_url is the base URL of the PostgREST service; endpoint paths
        (eg, "/grobid") are appended to it directly.
        """
        self.api_url = api_url

    def get_grobid(self, sha1: str) -> Optional[Dict[str, Any]]:
        """
        Looks up the row in the "/grobid" endpoint whose sha1hex equals the
        given SHA-1.

        Returns the first matching row as a dict, or None if the response
        body is empty. Raises requests.HTTPError on a non-success status.
        """
        resp = requests.get(
            self.api_url + "/grobid",
            # "eq." is the PostgREST equality-filter operator prefix
            params=dict(sha1hex="eq." + sha1),
        )
        resp.raise_for_status()
        rows = resp.json()
        if not rows:
            return None
        return rows[0]


class SandcrawlerMinioClient:
    """Fetches blobs from minio using the sandcrawler SHA-1 path convention."""

    def __init__(
        self,
        host_url: str,
        access_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        default_bucket: Optional[str] = "sandcrawler",
    ):
        """
        host_url is minio connection string (host:port)
        access and secret key are as expected
        default_bucket can be supplied so that it doesn't need to be repeated for each function call

        Example config:

            host_url="localhost:9000",
            access_key=os.environ['MINIO_ACCESS_KEY'],
            secret_key=os.environ['MINIO_SECRET_KEY'],
        """
        self.mc = minio.Minio(
            host_url,
            access_key=access_key,
            secret_key=secret_key,
            secure=False,
        )
        self.default_bucket = default_bucket

    def _blob_path(
        self,
        folder: str,
        sha1hex: str,
        extension: Optional[str],
        prefix: Optional[str],
    ) -> str:
        """
        Builds the object path for a blob:

            {prefix}{folder}/{sha1hex[0:2]}/{sha1hex[2:4]}/{sha1hex}{extension}

        Falsy extension or prefix values (None, "") are treated as empty
        strings. sha1hex must be exactly 40 characters long.
        """
        if not extension:
            extension = ""
        if not prefix:
            prefix = ""
        assert len(sha1hex) == 40, "sha1hex must be 40 characters"
        return "{}{}/{}/{}/{}{}".format(
            prefix,
            folder,
            sha1hex[0:2],
            sha1hex[2:4],
            sha1hex,
            extension,
        )

    def get_blob(
        self,
        folder: str,
        sha1hex: str,
        extension: str = "",
        prefix: str = "",
        bucket: Optional[str] = None,
    ) -> bytes:
        """
        sha1hex is sha1 of the blob itself

        Fetch blob from the given bucket/folder, using the sandcrawler SHA1 path convention

        If bucket is not supplied, self.default_bucket is used; one of the
        two must be set. Returns the full body of the object.
        """
        obj_path = self._blob_path(folder, sha1hex, extension, prefix)
        if not bucket:
            bucket = self.default_bucket
        assert bucket, "no bucket supplied and no default_bucket configured"
        blob = self.mc.get_object(
            bucket,
            obj_path,
        )
        try:
            # TODO: optionally verify SHA-1?
            return blob.data
        finally:
            # Always close the response and hand the connection back to the
            # pool, even if reading the body fails part-way through.
            blob.close()
            blob.release_conn()