aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/web_hacks.py
blob: aa33cbb583ccd8613ebd93fdc737f4401f2be242 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
import typing

import babel.numbers
import babel.support
import jinja2
from starlette.background import BackgroundTask
from starlette.templating import _TemplateResponse

from fatcat_scholar.config import I18N_LANG_OPTIONS, settings


class Jinja2Templates:
    """
    This is a patched version of starlette.templating.Jinja2Templates that
    supports extensions (list of strings) passed to jinja2.Environment
    """

    def __init__(
        self, directory: str, extensions: typing.Optional[typing.List[str]] = None
    ) -> None:
        """
        Builds the jinja2 environment for `directory` and stores it as
        `self.env`.

        `extensions` is an optional list of jinja2 extension import strings;
        omitting it (or passing None) means "no extensions".
        """
        assert jinja2 is not None, "jinja2 must be installed to use Jinja2Templates"
        self.env = self.get_env(directory, extensions)

    def get_env(
        self, directory: str, extensions: typing.Optional[typing.List[str]] = None
    ) -> "jinja2.Environment":
        """
        Creates an auto-escaping jinja2.Environment which loads templates from
        `directory`, enables the given `extensions`, and exposes a `url_for`
        global to templates.
        """

        @jinja2.pass_context
        def url_for(context: dict, name: str, **path_params: typing.Any) -> str:
            # delegates to the request object found in the rendering context
            request = context["request"]
            return request.url_for(name, **path_params)

        # A None default (instead of a shared `[]` literal) avoids the
        # mutable-default-argument pitfall; the environment still always
        # receives a list.
        enabled_extensions = [] if extensions is None else extensions
        loader = jinja2.FileSystemLoader(directory)
        env = jinja2.Environment(
            loader=loader, extensions=enabled_extensions, autoescape=True
        )
        env.globals["url_for"] = url_for
        return env

    def get_template(self, name: str) -> "jinja2.Template":
        """
        Looks up the template called `name` in this object's environment.
        """
        return self.env.get_template(name)

    def TemplateResponse(
        self,
        name: str,
        context: dict,
        status_code: int = 200,
        headers: typing.Optional[dict] = None,
        media_type: typing.Optional[str] = None,
        background: typing.Optional[BackgroundTask] = None,
    ) -> _TemplateResponse:
        """
        Wraps the template called `name`, together with `context`, in a
        starlette _TemplateResponse. The remaining arguments are passed through
        to the response unchanged.

        Raises ValueError if `context` has no "request" key.
        """
        if "request" not in context:
            raise ValueError('context must include a "request" key')
        template = self.get_template(name)
        return _TemplateResponse(
            template,
            context,
            status_code=status_code,
            headers=headers,
            media_type=media_type,
            background=background,
        )


def load_i18n_files() -> typing.Any:
    """
    Loads one babel Translations object per entry of I18N_LANG_OPTIONS (from
    the "fatcat_scholar/translations" directory) and returns them as a dict
    keyed by language option.

    This is a hack to work around lack of per-request translation
    (babel/gettext) locale switching in FastAPI and Starlette. Flask (and
    presumably others) get around this using global context (eg, in
    Flask-Babel).

    See related issues:

    - https://github.com/encode/starlette/issues/279
    - https://github.com/aio-libs/aiohttp-jinja2/issues/187
    """

    return {
        lang: babel.support.Translations.load(
            dirname="fatcat_scholar/translations",
            locales=[lang],
        )
        for lang in I18N_LANG_OPTIONS
    }


# Translation catalogs are loaded once, when this module is imported, and are
# keyed by the entries of I18N_LANG_OPTIONS; i18n_templates() indexes into this
# mapping by locale.
I18N_TRANSLATION_FILES = load_i18n_files()


def locale_gettext(translations: typing.Any) -> typing.Any:
    """
    Returns a single-argument callable which translates a message by calling
    `translations.ugettext()`. The method is looked up on each call, not when
    the callable is created.
    """

    def _translate(message):  # noqa: ANN001,ANN201
        translated = translations.ugettext(message)
        return translated

    return _translate


def locale_ngettext(translations: typing.Any) -> typing.Any:
    """
    Returns a three-argument callable (singular, plural, count) which
    translates a pluralizable message by calling `translations.ungettext()`.
    The method is looked up on each call, not when the callable is created.
    """

    def _translate_plural(singular, plural, count):  # noqa: ANN001,ANN201
        translated = translations.ungettext(singular, plural, count)
        return translated

    return _translate_plural


def i18n_templates(locale: str) -> Jinja2Templates:
    """
    Builds a brand new Jinja2Templates object (and thus a new Jinja2
    Environment) whose gettext functions are bound to the translations loaded
    for `locale`.

    This is a hack to work around lack of per-request translation
    (babel/gettext) locale switching in FastAPI and Starlette. Flask (and
    presumably others) get around this using global context (eg, in
    Flask-Babel).

    The intent is to call this function and create a new Jinja2 Environment for
    a specific language separately within a request (aka, not shared between
    requests), when needed. This is inefficient but should resolve issues with
    cross-request poisoning, both in threading (threadpool) or async
    concurrency.

    Raises KeyError if `locale` is not a key of I18N_TRANSLATION_FILES.

    See related issues:

    - https://github.com/encode/starlette/issues/279
    - https://github.com/aio-libs/aiohttp-jinja2/issues/187
    """

    catalog = I18N_TRANSLATION_FILES[locale]
    localized = Jinja2Templates(
        directory="fatcat_scholar/templates",
        extensions=["jinja2.ext.i18n", "jinja2.ext.do"],
    )
    env = localized.env

    # the explicit callables are installed after (and so on top of) the
    # translations object; keep this order
    env.install_gettext_translations(catalog, newstyle=True)  # type: ignore
    env.install_gettext_callables(  # type: ignore
        locale_gettext(catalog),
        locale_ngettext(catalog),
        newstyle=True,
    )

    # remove a lot of whitespace in HTML output with these configs
    env.trim_blocks = True
    env.lstrip_blocks = True

    # pass-through application settings (and a couple of helpers) to be
    # available in templates
    env.globals.update(
        settings=settings,
        babel_numbers=babel.numbers,
        make_access_redirect_url=make_access_redirect_url,
    )
    return localized


def parse_accept_lang(header: str, options: typing.List[str]) -> typing.Optional[str]:
    """
    Crude HTTP Accept-Language content negotiation.
    Assumes that languages are specified in order of priority, etc.

    For each comma-separated entry of `header`, any ";q=..." parameters are
    dropped (quality values are ignored entirely), surrounding whitespace is
    stripped, and only the primary subtag (the part before the first "-" or
    "_") is kept. The first two-letter primary subtag found in `options` is
    returned. An exact match is tried first; a lower-cased match is tried
    second, because language tags are case-insensitive.

    Returns None if `header` is empty or nothing matches.
    """
    if not header:
        return None
    for entry in header.split(","):
        # optional whitespace is allowed around list elements ("da, en-gb"),
        # so strip it before looking at the tag
        tag = entry.split(";")[0].strip()
        primary = tag.split("-")[0].split("_")[0]
        if len(primary) != 2:
            continue
        for candidate in (primary, primary.lower()):
            if candidate in options:
                return candidate
    return None


def test_parse_accept_lang() -> None:
    """
    Table-driven checks for parse_accept_lang(): each row is
    (header, options, expected result).
    """
    cases: typing.List[
        typing.Tuple[str, typing.List[str], typing.Optional[str]]
    ] = [
        ("", [], None),
        ("en,de", [], None),
        ("en,de", ["en"], "en"),
        ("en-GB,de", ["en"], "en"),
        ("zh_Hans_CN", ["en", "zh"], "zh"),
        ("en,de", ["de"], "de"),
        ("en-ca,en;q=0.8,en-us;q=0.6,de-de;q=0.4,de;q=0.2", ["de"], "de"),
        ("en-ca,en;q=0.8,en-us;q=0.6,de-de;q=0.4,de;q=0.2", ["en", "de"], "en"),
        ("en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", ["zh", "en", "de"], "en"),
    ]
    for header, options, expected in cases:
        result = parse_accept_lang(header, options)
        if expected is None:
            assert result is None
        else:
            assert result == expected


def wayback_direct_url(url: str) -> str:
    """
    Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access)

    Only URLs containing "://web.archive.org" are considered. The URL is split
    on "/", and the fifth piece (the timestamp position, directly after
    "/web/") gets "id_" appended, but only if that piece is all digits and at
    least one more piece follows it. Any other URL is returned unchanged.
    """
    if "://web.archive.org" in url:
        pieces = url.split("/")
        if len(pieces) >= 6 and pieces[4].isdigit():
            rewritten = pieces[:4] + [pieces[4] + "id_"] + pieces[5:]
            return "/".join(rewritten)
    return url


def test_wayback_direct_url() -> None:
    """
    Table-driven checks for wayback_direct_url(): each row is
    (input URL, expected output URL).
    """
    cases: typing.List[typing.Tuple[str, str]] = [
        # not a wayback URL: untouched
        (
            "http://fatcat.wiki/thing.pdf",
            "http://fatcat.wiki/thing.pdf",
        ),
        # wildcard instead of a numeric timestamp: untouched
        (
            "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf",
            "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf",
        ),
        # numeric timestamp: gets the 'id_' suffix
        (
            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf",
            "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf",
        ),
        (
            "https://web.archive.org/web/20170811115414/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf",
            "https://web.archive.org/web/20170811115414id_/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf",
        ),
    ]
    for given, expected in cases:
        assert wayback_direct_url(given) == expected


def make_access_redirect_url(work_ident: str, access_type: str, access_url: str) -> str:
    """
    Maps an access URL to the corresponding scholar.archive.org
    "/work/{ident}/access/..." URL, for the two access types handled here:

    - "wayback" with a "://web.archive.org/" URL: everything after the fifth
      "/" of `access_url` is appended under ".../access/wayback/"
    - "ia_file" with a "://archive.org/download/" URL: everything after the
      fourth "/" of `access_url` is appended under ".../access/ia_file/"

    Any other combination returns `access_url` unchanged.
    """
    if access_type == "wayback" and "://web.archive.org/" in access_url:
        # with maxsplit, whatever follows the fifth "/" stays in one piece (or
        # the slice is empty when the URL has fewer pieces)
        original_url = "/".join(access_url.split("/", 5)[5:])
        return f"https://scholar.archive.org/work/{work_ident}/access/wayback/{original_url}"

    if access_type == "ia_file" and "://archive.org/download/" in access_url:
        suffix = "/".join(access_url.split("/", 4)[4:])
        return f"https://scholar.archive.org/work/{work_ident}/access/ia_file/{suffix}"

    return access_url


def test_make_access_redirect_url() -> None:
    """
    Table-driven checks for make_access_redirect_url(), all using the same
    work identifier: each row is (access_type, access_url, expected result).
    """
    work_ident = "lmobci36t5aelogzjsazuwxpie"
    cases: typing.List[typing.Tuple[str, str, str]] = [
        (
            "wayback",
            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf",
            "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/wayback/http://fatcat.wiki/thing.pdf",
        ),
        # query parameters of the original URL are carried along
        (
            "wayback",
            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf?param=asdf",
            "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/wayback/http://fatcat.wiki/thing.pdf?param=asdf",
        ),
        (
            "ia_file",
            "https://archive.org/download/something/file.pdf",
            "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/ia_file/something/file.pdf",
        ),
        # unknown access type: URL is passed through as-is
        (
            "blah",
            "https://mit.edu/file.pdf",
            "https://mit.edu/file.pdf",
        ),
        (
            "wayback",
            "https://web.archive.org/web/20170811115414/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf",
            "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/wayback/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf",
        ),
    ]
    for access_type, access_url, expected in cases:
        assert make_access_redirect_url(work_ident, access_type, access_url) == expected