diff options
Diffstat (limited to 'fatcat_scholar/web_hacks.py')
-rw-r--r-- | fatcat_scholar/web_hacks.py | 179 |
1 files changed, 179 insertions, 0 deletions
# fatcat_scholar/web_hacks.py
import typing

import jinja2
from starlette.background import BackgroundTask
from starlette.templating import _TemplateResponse


class Jinja2Templates:
    """
    This is a patched version of starlette.templating.Jinja2Templates that
    supports extensions (list of strings) passed to jinja2.Environment
    """

    def __init__(
        self, directory: str, extensions: typing.Optional[typing.List[str]] = None
    ) -> None:
        """
        Builds the jinja2 environment for templates found under `directory`.

        `extensions` is an optional list of jinja2 extension import strings;
        None (the default) means no extensions.
        """
        # jinja2 is imported unconditionally at the top of this module, so this
        # check cannot fail here; it is kept from the code this class patches.
        assert jinja2 is not None, "jinja2 must be installed to use Jinja2Templates"
        self.env = self.get_env(directory, extensions)

    def get_env(
        self, directory: str, extensions: typing.Optional[typing.List[str]] = None
    ) -> "jinja2.Environment":
        """
        Creates an autoescaping jinja2.Environment that loads templates from
        `directory`, enables `extensions`, and exposes a `url_for` global.
        """

        @jinja2.pass_context
        def url_for(context: dict, name: str, **path_params: typing.Any) -> str:
            # Delegates to the request object stored in the template context.
            request = context["request"]
            return request.url_for(name, **path_params)

        loader = jinja2.FileSystemLoader(directory)
        # A None default (instead of a shared mutable `[]`) is normalised here.
        env = jinja2.Environment(
            loader=loader, extensions=extensions or [], autoescape=True
        )
        env.globals["url_for"] = url_for
        return env

    def get_template(self, name: str) -> "jinja2.Template":
        """
        Looks up the template called `name` in this instance's environment.
        """
        return self.env.get_template(name)

    def TemplateResponse(
        self,
        name: str,
        context: dict,
        status_code: int = 200,
        headers: typing.Optional[dict] = None,
        media_type: typing.Optional[str] = None,
        background: typing.Optional[BackgroundTask] = None,
    ) -> _TemplateResponse:
        """
        Wraps the template called `name` and `context` in a _TemplateResponse.

        Raises ValueError if `context` has no "request" key. The remaining
        arguments are passed through to _TemplateResponse unchanged.
        """
        if "request" not in context:
            raise ValueError('context must include a "request" key')
        template = self.get_template(name)
        return _TemplateResponse(
            template,
            context,
            status_code=status_code,
            headers=headers,
            media_type=media_type,
            background=background,
        )


def parse_accept_lang(header: str, options: typing.List[str]) -> typing.Optional[str]:
    """
    Crude HTTP Accept-Language content negotiation.
    Assumes that languages are specified in order of priority, etc.

    Each comma-separated entry is reduced to its primary language code: the
    ";q=..." parameter, any "-"/"_" region or script suffix, and surrounding
    whitespace are dropped. The first two-letter code found in `options` is
    returned. A code is tried as written first, then lowercased, so
    "EN-gb" matches the option "en".

    Returns None if `header` is empty or no entry matches.
    """
    if not header:
        return None
    for entry in header.split(","):
        # Strip last: whitespace may surround the tag ("en, de" or "en ;q=0.8").
        code = entry.split(";")[0].split("-")[0].split("_")[0].strip()
        if len(code) != 2:
            continue
        if code in options:
            return code
        lowered = code.lower()
        if lowered in options:
            return lowered
    return None


def test_parse_accept_lang() -> None:
    assert parse_accept_lang("", []) is None
    assert parse_accept_lang("en,de", []) is None
    assert parse_accept_lang("en,de", ["en"]) == "en"
    assert parse_accept_lang("en-GB,de", ["en"]) == "en"
    assert parse_accept_lang("zh_Hans_CN", ["en", "zh"]) == "zh"
    assert parse_accept_lang("en,de", ["de"]) == "de"
    assert (
        parse_accept_lang("en-ca,en;q=0.8,en-us;q=0.6,de-de;q=0.4,de;q=0.2", ["de"])
        == "de"
    )
    assert (
        parse_accept_lang(
            "en-ca,en;q=0.8,en-us;q=0.6,de-de;q=0.4,de;q=0.2", ["en", "de"]
        )
        == "en"
    )
    assert (
        parse_accept_lang("en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", ["zh", "en", "de"])
        == "en"
    )
    # whitespace after the comma and before the parameters
    assert parse_accept_lang("en-US, de;q=0.8", ["de"]) == "de"
    assert parse_accept_lang("en ;q=0.8", ["en"]) == "en"
    # upper-case tags fall back to a lowercase match
    assert parse_accept_lang("DE-de", ["de"]) == "de"
    # wildcard is not a two-letter code
    assert parse_accept_lang("*", ["en"]) is None


def wayback_direct_url(url: str) -> str:
    """
    Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access)

    URLs that do not contain "://web.archive.org", or whose timestamp path
    segment is not all digits (eg, "*", or already carrying a suffix), are
    returned unchanged.
    """
    if "://web.archive.org" not in url:
        return url
    # "https://web.archive.org/web/<timestamp>/<original...>" splits into
    # ["https:", "", "web.archive.org", "web", "<timestamp>", ...], so index 4
    # is the timestamp and at least one more segment must follow it.
    segments = url.split("/")
    if len(segments) < 6 or not segments[4].isdigit():
        return url
    segments[4] += "id_"
    return "/".join(segments)


def test_wayback_direct_url() -> None:
    assert (
        wayback_direct_url("http://fatcat.wiki/thing.pdf")
        == "http://fatcat.wiki/thing.pdf"
    )
    assert (
        wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf")
        == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf"
    )
    assert (
        wayback_direct_url(
            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
        )
        == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf"
    )
    assert (
        wayback_direct_url(
            "https://web.archive.org/web/20170811115414/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf"
        )
        == "https://web.archive.org/web/20170811115414id_/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf"
    )


def make_access_redirect_url(work_ident: str, access_type: str, access_url: str) -> str:
    """
    Maps an access URL to the matching scholar.archive.org access redirect URL.

    For `access_type` "wayback" with a web.archive.org URL, everything after
    the timestamp segment is appended to ".../access/wayback/". For "ia_file"
    with an archive.org/download URL, everything after "download/" is appended
    to ".../access/ia_file/". Any other input, or a URL with nothing after
    that prefix, is returned unchanged.
    """
    if access_type == "wayback" and "://web.archive.org/" in access_url:
        # segments[5:] is the original URL following "/web/<timestamp>/"
        original_url = "/".join(access_url.split("/")[5:])
        if original_url:
            return f"https://scholar.archive.org/work/{work_ident}/access/wayback/{original_url}"
    elif access_type == "ia_file" and "://archive.org/download/" in access_url:
        # segments[4:] is "<item>/<file path>" following "/download/"
        suffix = "/".join(access_url.split("/")[4:])
        if suffix:
            return f"https://scholar.archive.org/work/{work_ident}/access/ia_file/{suffix}"
    # Not rewritable (or nothing to redirect to): hand back the URL as given.
    return access_url


def test_make_access_redirect_url() -> None:
    assert (
        make_access_redirect_url(
            "lmobci36t5aelogzjsazuwxpie",
            "wayback",
            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf",
        )
        == "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/wayback/http://fatcat.wiki/thing.pdf"
    )
    assert (
        make_access_redirect_url(
            "lmobci36t5aelogzjsazuwxpie",
            "wayback",
            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf?param=asdf",
        )
        == "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/wayback/http://fatcat.wiki/thing.pdf?param=asdf"
    )
    assert (
        make_access_redirect_url(
            "lmobci36t5aelogzjsazuwxpie",
            "ia_file",
            "https://archive.org/download/something/file.pdf",
        )
        == "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/ia_file/something/file.pdf"
    )
    assert (
        make_access_redirect_url(
            "lmobci36t5aelogzjsazuwxpie", "blah", "https://mit.edu/file.pdf"
        )
        == "https://mit.edu/file.pdf"
    )
    assert (
        make_access_redirect_url(
            "lmobci36t5aelogzjsazuwxpie",
            "wayback",
            "https://web.archive.org/web/20170811115414/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf",
        )
        == "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/wayback/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf"
    )
    # nothing after the wayback timestamp / download prefix: leave URL alone
    assert (
        make_access_redirect_url(
            "lmobci36t5aelogzjsazuwxpie",
            "wayback",
            "https://web.archive.org/web/1234/",
        )
        == "https://web.archive.org/web/1234/"
    )
    assert (
        make_access_redirect_url(
            "lmobci36t5aelogzjsazuwxpie",
            "ia_file",
            "https://archive.org/download/",
        )
        == "https://archive.org/download/"
    )