summaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/sandcrawler.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-05-19 10:29:43 -0700
committerBryan Newbold <bnewbold@archive.org>2020-05-19 10:29:43 -0700
commitc5faf30df8772af607689a3f65241f34b365325b (patch)
tree35bcc82bf7b08bce0317481949017123721cb804 /fatcat_scholar/sandcrawler.py
parentf435f4aa9e3f490f9b7baa0d7a00fa5a986bb31e (diff)
downloadfatcat-scholar-c5faf30df8772af607689a3f65241f34b365325b.tar.gz
fatcat-scholar-c5faf30df8772af607689a3f65241f34b365325b.zip
WIP on release-to-sim fetching
Diffstat (limited to 'fatcat_scholar/sandcrawler.py')
-rw-r--r--fatcat_scholar/sandcrawler.py75
1 files changed, 75 insertions, 0 deletions
diff --git a/fatcat_scholar/sandcrawler.py b/fatcat_scholar/sandcrawler.py
new file mode 100644
index 0000000..db6014f
--- /dev/null
+++ b/fatcat_scholar/sandcrawler.py
@@ -0,0 +1,75 @@
+
+import json
+import minio
+import requests
+from typing import Dict, Optional, Any
+
class SandcrawlerPostgrestClient:
    """
    Small read-only HTTP client for the sandcrawler PostgREST API.
    """

    def __init__(self, api_url: str, timeout: Optional[float] = 30.0):
        """
        api_url is the base URL of the API; request paths (eg, "/grobid")
        are appended to it verbatim
        timeout is the per-request timeout in seconds passed to requests;
        pass None to wait indefinitely (the previous behavior)
        """
        self.api_url = api_url
        self.timeout = timeout

    def get_grobid(self, sha1: str) -> Optional[Dict[str, Any]]:
        """
        Queries the "/grobid" endpoint with the filter sha1hex=eq.<sha1>.

        Returns the first element of the JSON response, or None if the
        response body is empty (no matching row).

        Raises a requests exception on non-success HTTP status (via
        raise_for_status) or if the request times out.
        """
        resp = requests.get(
            self.api_url + "/grobid",
            params={"sha1hex": "eq." + sha1},
            # without a timeout, requests would block forever on a stalled server
            timeout=self.timeout,
        )
        resp.raise_for_status()
        rows = resp.json()
        return rows[0] if rows else None
+
+
class SandcrawlerMinioClient(object):
    """
    Thin wrapper around minio.Minio for fetching blobs stored under the
    sandcrawler SHA-1 path convention.
    """

    def __init__(
        self,
        host_url: str,
        access_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        default_bucket: Optional[str] = "sandcrawler",
    ):
        """
        host is minio connection string (host:port)
        access and secret key are as expected
        default_bucket can be supplied so that it doesn't need to be repeated for each function call

        Example config:

            host="localhost:9000",
            access_key=os.environ['MINIO_ACCESS_KEY'],
            secret_key=os.environ['MINIO_SECRET_KEY'],
        """
        self.mc = minio.Minio(
            host_url,
            access_key=access_key,
            secret_key=secret_key,
            secure=False,
        )
        self.default_bucket = default_bucket

    def _blob_path(
        self,
        folder: str,
        sha1hex: str,
        extension: Optional[str],
        prefix: Optional[str],
    ) -> str:
        """
        Builds the object path for a blob:

            {prefix}{folder}/{sha1hex[0:2]}/{sha1hex[2:4]}/{sha1hex}{extension}

        Falsy extension or prefix (eg, None) is treated as an empty string.

        Raises ValueError if sha1hex is not exactly 40 characters long.
        """
        # explicit raise instead of assert, so the check survives `python -O`
        if len(sha1hex) != 40:
            raise ValueError(
                "expected 40-character SHA-1 hex string, got length {}".format(len(sha1hex))
            )
        return "{}{}/{}/{}/{}{}".format(
            prefix or "",
            folder,
            sha1hex[0:2],
            sha1hex[2:4],
            sha1hex,
            extension or "",
        )

    def get_blob(
        self,
        folder: str,
        sha1hex: str,
        extension: Optional[str] = "",
        prefix: Optional[str] = "",
        bucket: Optional[str] = None,
    ) -> bytes:
        """
        sha1hex is sha1 of the blob itself

        Fetches blob from the given bucket/folder, using the sandcrawler SHA1 path convention.
        If bucket is not supplied, self.default_bucket is used.

        Returns the full object body. Raises ValueError if sha1hex is
        malformed or if no bucket is available; errors from the minio
        client propagate unchanged.
        """
        obj_path = self._blob_path(folder, sha1hex, extension, prefix)
        if not bucket:
            bucket = self.default_bucket
        if not bucket:
            raise ValueError("no bucket supplied and no default_bucket configured")
        resp = self.mc.get_object(
            bucket,
            obj_path,
        )
        try:
            # TODO: optionally verify SHA-1?
            return resp.data
        finally:
            # minio-py requires the caller to close the response and hand the
            # connection back to the pool, including when the read fails
            resp.close()
            resp.release_conn()