summaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/sandcrawler.py
blob: 9b016f5ada2c11cf7147fcd8067ee12fe4310b3a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
from typing import Any, Dict, Optional

import minio
import requests


class SandcrawlerPostgrestClient:
    def __init__(self, api_url: str):
        self.api_url = api_url

    def get_grobid(self, sha1: str) -> Optional[Dict[str, Any]]:
        resp = requests.get(self.api_url + "/grobid", params=dict(sha1hex="eq." + sha1))
        resp.raise_for_status()
        resp_json = resp.json()
        if resp_json:
            return resp_json[0]
        else:
            return None

    def get_pdf_meta(self, sha1: str) -> Optional[Dict[str, Any]]:
        resp = requests.get(
            self.api_url + "/pdf_meta", params=dict(sha1hex="eq." + sha1)
        )
        resp.raise_for_status()
        resp_json = resp.json()
        if resp_json:
            return resp_json[0]
        else:
            return None

    def get_html_meta(self, sha1: str) -> Optional[Dict[str, Any]]:
        resp = requests.get(
            self.api_url + "/html_meta", params=dict(sha1hex="eq." + sha1)
        )
        resp.raise_for_status()
        resp_json = resp.json()
        if resp_json:
            return resp_json[0]
        else:
            return None

    def get_crossref(self, doi: str) -> Optional[Dict[str, Any]]:
        resp = requests.get(
            self.api_url + "/crossref_with_refs", params=dict(doi="eq." + doi)
        )
        resp.raise_for_status()
        resp_json = resp.json()
        if resp_json:
            return resp_json[0]
        else:
            return None


class SandcrawlerMinioClient:
    def __init__(
        self,
        host_url: str,
        access_key: Optional[str] = None,
        secret_key: Optional[str] = None,
        default_bucket: Optional[str] = "sandcrawler",
    ):
        """
        host is minio connection string (host:port)
        access and secret key are as expected
        default_bucket can be supplied so that it doesn't need to be repeated for each function call

        Example config:

            host="localhost:9000",
            access_key=os.environ['MINIO_ACCESS_KEY'],
            secret_key=os.environ['MINIO_SECRET_KEY'],
        """
        self.mc = minio.Minio(
            host_url,
            access_key=access_key,
            secret_key=secret_key,
            secure=False,
        )
        self.default_bucket = default_bucket

    def _blob_path(self, folder: str, sha1hex: str, extension: str, prefix: str) -> str:
        if not extension:
            extension = ""
        if not prefix:
            prefix = ""
        assert len(sha1hex) == 40
        obj_path = "{}{}/{}/{}/{}{}".format(
            prefix,
            folder,
            sha1hex[0:2],
            sha1hex[2:4],
            sha1hex,
            extension,
        )
        return obj_path

    def get_blob(
        self,
        folder: str,
        sha1hex: str,
        extension: str = "",
        prefix: str = "",
        bucket: Optional[str] = None,
    ) -> bytes:
        """
        sha1hex is sha1 of the blob itself

        Fetched blob from the given bucket/folder, using the sandcrawler SHA1 path convention
        """
        obj_path = self._blob_path(folder, sha1hex, extension, prefix)
        if not bucket:
            bucket = self.default_bucket
        assert bucket
        blob = self.mc.get_object(
            bucket,
            obj_path,
        )
        # TODO: optionally verify SHA-1?
        return blob.data