From c129824ddee29d130eedfaa47468365cf6825740 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 12 Aug 2022 11:56:57 -0700 Subject: rename fatcat_scholar.hacks to fatcat_scholar.web_hacks --- fatcat_scholar/hacks.py | 179 -------------------------------------------- fatcat_scholar/web.py | 2 +- fatcat_scholar/web_hacks.py | 179 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 180 insertions(+), 180 deletions(-) delete mode 100644 fatcat_scholar/hacks.py create mode 100644 fatcat_scholar/web_hacks.py diff --git a/fatcat_scholar/hacks.py b/fatcat_scholar/hacks.py deleted file mode 100644 index 2be90f0..0000000 --- a/fatcat_scholar/hacks.py +++ /dev/null @@ -1,179 +0,0 @@ -import typing - -import jinja2 -from starlette.background import BackgroundTask -from starlette.templating import _TemplateResponse - - -class Jinja2Templates: - """ - This is a patched version of starlette.templating.Jinja2Templates that - supports extensions (list of strings) passed to jinja2.Environment - """ - - def __init__(self, directory: str, extensions: typing.List[str] = []) -> None: - assert jinja2 is not None, "jinja2 must be installed to use Jinja2Templates" - self.env = self.get_env(directory, extensions) - - def get_env( - self, directory: str, extensions: typing.List[str] = [] - ) -> "jinja2.Environment": - @jinja2.pass_context - def url_for(context: dict, name: str, **path_params: typing.Any) -> str: - request = context["request"] - return request.url_for(name, **path_params) - - loader = jinja2.FileSystemLoader(directory) - env = jinja2.Environment(loader=loader, extensions=extensions, autoescape=True) - env.globals["url_for"] = url_for - return env - - def get_template(self, name: str) -> "jinja2.Template": - return self.env.get_template(name) - - def TemplateResponse( - self, - name: str, - context: dict, - status_code: int = 200, - headers: dict = None, - media_type: str = None, - background: BackgroundTask = None, - ) -> _TemplateResponse: - if "request" not in context: - raise ValueError('context must include a "request" key') - template = self.get_template(name) - return _TemplateResponse( - template, - context, - status_code=status_code, - headers=headers, - media_type=media_type, - background=background, - ) - - -def parse_accept_lang(header: str, options: typing.List[str]) -> typing.Optional[str]: - """ - Crude HTTP Accept-Language content negotiation. - Assumes that languages are specified in order of priority, etc. - """ - if not header: - return None - chunks = [v.split(";")[0].split("-")[0].split("_")[0] for v in header.split(",")] - for c in chunks: - if len(c) == 2 and c in options: - return c - return None - - -def test_parse_accept_lang() -> None: - assert parse_accept_lang("", []) is None - assert parse_accept_lang("en,de", []) is None - assert parse_accept_lang("en,de", ["en"]) == "en" - assert parse_accept_lang("en-GB,de", ["en"]) == "en" - assert parse_accept_lang("zh_Hans_CN", ["en", "zh"]) == "zh" - assert parse_accept_lang("en,de", ["de"]) == "de" - assert ( - parse_accept_lang("en-ca,en;q=0.8,en-us;q=0.6,de-de;q=0.4,de;q=0.2", ["de"]) - == "de" - ) - assert ( - parse_accept_lang( - "en-ca,en;q=0.8,en-us;q=0.6,de-de;q=0.4,de;q=0.2", ["en", "de"] - ) - == "en" - ) - assert ( - parse_accept_lang("en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", ["zh", "en", "de"]) - == "en" - ) - - -def wayback_direct_url(url: str) -> str: - """ - Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access) - """ - if "://web.archive.org" not in url: - return url - segments = url.split("/") - if len(segments) < 6 or not segments[4].isdigit(): - return url - segments[4] += "id_" - return "/".join(segments) - - -def test_wayback_direct_url() -> None: - assert ( - wayback_direct_url("http://fatcat.wiki/thing.pdf") - == "http://fatcat.wiki/thing.pdf" - ) - assert ( - wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf") - == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf" - ) - assert ( - wayback_direct_url( - "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf" - ) - == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf" - ) - assert ( - wayback_direct_url( - "https://web.archive.org/web/20170811115414/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf" - ) - == "https://web.archive.org/web/20170811115414id_/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf" - ) - - -def make_access_redirect_url(work_ident: str, access_type: str, access_url: str) -> str: - if access_type == "wayback" and "://web.archive.org/" in access_url: - segments = access_url.split("/") - original_url = "/".join(segments[5:]) - return f"https://scholar.archive.org/work/{work_ident}/access/wayback/{original_url}" - elif access_type == "ia_file" and "://archive.org/download/" in access_url: - suffix = "/".join(access_url.split("/")[4:]) - return f"https://scholar.archive.org/work/{work_ident}/access/ia_file/{suffix}" - else: - return access_url - - -def test_make_access_redirect_url() -> None: - assert ( - make_access_redirect_url( - "lmobci36t5aelogzjsazuwxpie", - "wayback", - "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf", - ) - == "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/wayback/http://fatcat.wiki/thing.pdf" - ) - assert ( - make_access_redirect_url( - "lmobci36t5aelogzjsazuwxpie", - "wayback", - "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf?param=asdf", - ) - == "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/wayback/http://fatcat.wiki/thing.pdf?param=asdf" - ) - assert ( - make_access_redirect_url( - "lmobci36t5aelogzjsazuwxpie", - "ia_file", - "https://archive.org/download/something/file.pdf", - ) - == "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/ia_file/something/file.pdf" - ) - assert ( - make_access_redirect_url( - "lmobci36t5aelogzjsazuwxpie", "blah", "https://mit.edu/file.pdf" - ) - == "https://mit.edu/file.pdf" - ) - assert ( - make_access_redirect_url( - "lmobci36t5aelogzjsazuwxpie", - "wayback", - "https://web.archive.org/web/20170811115414/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf", - ) - == "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/wayback/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf" - ) diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py index 18299ee..1e04400 100644 --- a/fatcat_scholar/web.py +++ b/fatcat_scholar/web.py @@ -29,7 +29,7 @@ from starlette.exceptions import HTTPException as StarletteHTTPException from starlette_prometheus import PrometheusMiddleware, metrics from fatcat_scholar.config import GIT_REVISION, settings -from fatcat_scholar.hacks import ( +from fatcat_scholar.web_hacks import ( Jinja2Templates, make_access_redirect_url, parse_accept_lang, diff --git a/fatcat_scholar/web_hacks.py b/fatcat_scholar/web_hacks.py new file mode 100644 index 0000000..2be90f0 --- /dev/null +++ b/fatcat_scholar/web_hacks.py @@ -0,0 +1,179 @@ +import typing + +import jinja2 +from starlette.background import BackgroundTask +from starlette.templating import _TemplateResponse + + +class Jinja2Templates: + """ + This is a patched version of starlette.templating.Jinja2Templates that + supports extensions (list of strings) passed to jinja2.Environment + """ + + def __init__(self, directory: str, extensions: typing.List[str] = []) -> None: + assert jinja2 is not None, "jinja2 must be installed to use Jinja2Templates" + self.env = self.get_env(directory, extensions) + + def get_env( + self, directory: str, extensions: typing.List[str] = [] + ) -> "jinja2.Environment": + @jinja2.pass_context + def url_for(context: dict, name: str, **path_params: typing.Any) -> str: + request = context["request"] + return request.url_for(name, **path_params) + + loader = jinja2.FileSystemLoader(directory) + env = jinja2.Environment(loader=loader, extensions=extensions, autoescape=True) + env.globals["url_for"] = url_for + return env + + def get_template(self, name: str) -> "jinja2.Template": + return self.env.get_template(name) + + def TemplateResponse( + self, + name: str, + context: dict, + status_code: int = 200, + headers: dict = None, + media_type: str = None, + background: BackgroundTask = None, + ) -> _TemplateResponse: + if "request" not in context: + raise ValueError('context must include a "request" key') + template = self.get_template(name) + return _TemplateResponse( + template, + context, + status_code=status_code, + headers=headers, + media_type=media_type, + background=background, + ) + + +def parse_accept_lang(header: str, options: typing.List[str]) -> typing.Optional[str]: + """ + Crude HTTP Accept-Language content negotiation. + Assumes that languages are specified in order of priority, etc. + """ + if not header: + return None + chunks = [v.split(";")[0].split("-")[0].split("_")[0] for v in header.split(",")] + for c in chunks: + if len(c) == 2 and c in options: + return c + return None + + +def test_parse_accept_lang() -> None: + assert parse_accept_lang("", []) is None + assert parse_accept_lang("en,de", []) is None + assert parse_accept_lang("en,de", ["en"]) == "en" + assert parse_accept_lang("en-GB,de", ["en"]) == "en" + assert parse_accept_lang("zh_Hans_CN", ["en", "zh"]) == "zh" + assert parse_accept_lang("en,de", ["de"]) == "de" + assert ( + parse_accept_lang("en-ca,en;q=0.8,en-us;q=0.6,de-de;q=0.4,de;q=0.2", ["de"]) + == "de" + ) + assert ( + parse_accept_lang( + "en-ca,en;q=0.8,en-us;q=0.6,de-de;q=0.4,de;q=0.2", ["en", "de"] + ) + == "en" + ) + assert ( + parse_accept_lang("en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", ["zh", "en", "de"]) + == "en" + ) + + +def wayback_direct_url(url: str) -> str: + """ + Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access) + """ + if "://web.archive.org" not in url: + return url + segments = url.split("/") + if len(segments) < 6 or not segments[4].isdigit(): + return url + segments[4] += "id_" + return "/".join(segments) + + +def test_wayback_direct_url() -> None: + assert ( + wayback_direct_url("http://fatcat.wiki/thing.pdf") + == "http://fatcat.wiki/thing.pdf" + ) + assert ( + wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf") + == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf" + ) + assert ( + wayback_direct_url( + "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf" + ) + == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf" + ) + assert ( + wayback_direct_url( + "https://web.archive.org/web/20170811115414/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf" + ) + == "https://web.archive.org/web/20170811115414id_/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf" + ) + + +def make_access_redirect_url(work_ident: str, access_type: str, access_url: str) -> str: + if access_type == "wayback" and "://web.archive.org/" in access_url: + segments = access_url.split("/") + original_url = "/".join(segments[5:]) + return f"https://scholar.archive.org/work/{work_ident}/access/wayback/{original_url}" + elif access_type == "ia_file" and "://archive.org/download/" in access_url: + suffix = "/".join(access_url.split("/")[4:]) + return f"https://scholar.archive.org/work/{work_ident}/access/ia_file/{suffix}" + else: + return access_url + + +def test_make_access_redirect_url() -> None: + assert ( + make_access_redirect_url( + "lmobci36t5aelogzjsazuwxpie", + "wayback", + "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf", + ) + == "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/wayback/http://fatcat.wiki/thing.pdf" + ) + assert ( + make_access_redirect_url( + "lmobci36t5aelogzjsazuwxpie", + "wayback", + "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf?param=asdf", + ) + == "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/wayback/http://fatcat.wiki/thing.pdf?param=asdf" + ) + assert ( + make_access_redirect_url( + "lmobci36t5aelogzjsazuwxpie", + "ia_file", + "https://archive.org/download/something/file.pdf", + ) + == "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/ia_file/something/file.pdf" + ) + assert ( + make_access_redirect_url( + "lmobci36t5aelogzjsazuwxpie", "blah", "https://mit.edu/file.pdf" + ) + == "https://mit.edu/file.pdf" + ) + assert ( + make_access_redirect_url( + "lmobci36t5aelogzjsazuwxpie", + "wayback", + "https://web.archive.org/web/20170811115414/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf", + ) + == "https://scholar.archive.org/work/lmobci36t5aelogzjsazuwxpie/access/wayback/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf" + ) -- cgit v1.2.3