| author | Bryan Newbold <bnewbold@archive.org> | 2021-05-17 21:04:29 -0700 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-05-17 21:04:29 -0700 |
| commit | f767a344c1ed7722b79710c6f3c61d5802f78860 (patch) | |
| tree | 8a4779f4c83687965a80127316274967f14f4685 | |
| parent | f4ffc6863ec7d08a195cb8cb5370a153d093454e (diff) | |
| download | fatcat-scholar-f767a344c1ed7722b79710c6f3c61d5802f78860.tar.gz, fatcat-scholar-f767a344c1ed7722b79710c6f3c61d5802f78860.zip | |
iterate on PDF redirect links
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | fatcat_scholar/hacks.py | 69 |
| -rw-r--r-- | fatcat_scholar/search.py | 2 |
| -rw-r--r-- | fatcat_scholar/templates/work.html | 8 |
| -rw-r--r-- | fatcat_scholar/web.py | 65 |
| -rw-r--r-- | tests/test_web.py | 44 |

5 files changed, 149 insertions(+), 39 deletions(-)
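This commit moves the `citation_pdf_url` logic away from the sha1-based `/access-redirect/` lookup (now marked deprecated) to explicit `/access/wayback/` and `/access/ia_file/` redirect routes, backed by two helpers added to `fatcat_scholar/hacks.py`. As orientation before the diff, a minimal usage sketch of those helpers; inputs and expected outputs are copied from the unit tests included in this commit:

```python
# Sketch only: behaviour mirrors the unit tests added in fatcat_scholar/hacks.py below.
from fatcat_scholar.hacks import make_access_redirect_url, wayback_direct_url

# wayback capture URL -> stable scholar.archive.org redirect (used as citation_pdf_url)
assert (
    make_access_redirect_url(
        "wayback", "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
    )
    == "https://scholar.archive.org/access/wayback/1234/http://fatcat.wiki/thing.pdf"
)

# archive.org file download URL -> /access/ia_file/ redirect
assert (
    make_access_redirect_url("ia_file", "https://archive.org/download/something/file.pdf")
    == "https://scholar.archive.org/access/ia_file/something/file.pdf"
)

# wayback replay URL -> direct file access ("id_") URL, which the redirect endpoints resolve to
assert (
    wayback_direct_url("https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf")
    == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf"
)
```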
diff --git a/fatcat_scholar/hacks.py b/fatcat_scholar/hacks.py
index 5cb8572..1b53e01 100644
--- a/fatcat_scholar/hacks.py
+++ b/fatcat_scholar/hacks.py
@@ -88,3 +88,72 @@ def test_parse_accept_lang() -> None:
         parse_accept_lang("en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", ["zh", "en", "de"])
         == "en"
     )
+
+
+def wayback_direct_url(url: str) -> str:
+    """
+    Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access)
+    """
+    if not "://web.archive.org" in url:
+        return url
+    segments = url.split("/")
+    if len(segments) < 6 or not segments[4].isdigit():
+        return url
+    segments[4] += "id_"
+    return "/".join(segments)
+
+
+def test_wayback_direct_url() -> None:
+    assert (
+        wayback_direct_url("http://fatcat.wiki/thing.pdf")
+        == "http://fatcat.wiki/thing.pdf"
+    )
+    assert (
+        wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf")
+        == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf"
+    )
+    assert (
+        wayback_direct_url(
+            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
+        )
+        == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf"
+    )
+
+
+def make_access_redirect_url(access_type: str, access_url: str) -> str:
+    if access_type == "wayback" and "://web.archive.org/" in access_url:
+        segments = access_url.split("/")
+        dt = segments[4]
+        original_url = "/".join(segments[5:])
+        return f"https://scholar.archive.org/access/wayback/{dt}/{original_url}"
+    elif access_type == "ia_file" and "://archive.org/download/" in access_url:
+        suffix = "/".join(access_url.split("/")[4:])
+        return f"https://scholar.archive.org/access/ia_file/{suffix}"
+    else:
+        return access_url
+
+
+def test_make_access_redirect_url() -> None:
+    assert (
+        make_access_redirect_url(
+            "wayback", "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
+        )
+        == "https://scholar.archive.org/access/wayback/1234/http://fatcat.wiki/thing.pdf"
+    )
+    assert (
+        make_access_redirect_url(
+            "wayback",
+            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf?param=asdf",
+        )
+        == "https://scholar.archive.org/access/wayback/1234/http://fatcat.wiki/thing.pdf?param=asdf"
+    )
+    assert (
+        make_access_redirect_url(
+            "ia_file", "https://archive.org/download/something/file.pdf"
+        )
+        == "https://scholar.archive.org/access/ia_file/something/file.pdf"
+    )
+    assert (
+        make_access_redirect_url("blah", "https://mit.edu/file.pdf")
+        == "https://mit.edu/file.pdf"
+    )
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 2c02a58..b9ede1d 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -463,7 +463,7 @@ def get_es_scholar_doc(key: str) -> Optional[dict]:
     return doc
 
 
-def lookup_fulltext_pdf(sha1: str) -> Optional[dict]:
+def lookup_fulltext_pdf(sha1: str) -> Optional[ScholarFulltext]:
     """
     Fetch a document by fulltext file sha1, returning only the 'fulltext' sub-document.
     """
diff --git a/fatcat_scholar/templates/work.html b/fatcat_scholar/templates/work.html
index 611576b..67c87e0 100644
--- a/fatcat_scholar/templates/work.html
+++ b/fatcat_scholar/templates/work.html
@@ -30,10 +30,10 @@
 {% if work.biblio.doi %}
   <meta name="citation_doi" content="{{ work.biblio.doi }}">
 {% endif %}
-{% if work.fulltext.access_url and work.biblio.release_ident == work.fulltext.release_ident and work.fulltext.access_type in ['wayback', 'ia_file'] and work.fulltext.file_mimetype == "application/pdf" and work.fulltext.file_sha1 %}
-  <!-- PDF access redirect URL, as requested by, eg, scholar.google.com -->
-  <meta name="citation_pdf_url" content="https://scholar.archive.org/access-redirect/{{ work.fulltext.file_sha1 }}.pdf">
-  <!-- Multiple URLs allowed? <meta name="citation_pdf_url" content="{{ work.fulltext.access_url }}"> -->
+{% if work.fulltext.access_url and work.biblio.release_ident == work.fulltext.release_ident and work.fulltext.access_type in ['wayback', 'ia_file'] and work.fulltext.file_mimetype in ["application/pdf", None] and work.fulltext.file_sha1 %}
+  <!-- single PDF access redirect URL -->
+  <meta name="citation_pdf_url" content="{{ make_access_redirect_url(work.fulltext.access_type, work.fulltext.access_url) }}">
+  <!-- direct URL: {{ work.fulltext.access_url | safe }} -->
 {% endif %}
 {% endblock %}
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index 3c8b8f3..f2ef331 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -5,6 +5,7 @@ So far there are few endpoints, so we just put them all here!
 """
 
 import logging
+import urllib.parse
 from typing import Optional, Any, List, Dict
 
 from pydantic import BaseModel
@@ -25,7 +26,12 @@ from starlette_prometheus import metrics, PrometheusMiddleware
 from starlette.exceptions import HTTPException as StarletteHTTPException
 
 from fatcat_scholar.config import settings, GIT_REVISION
-from fatcat_scholar.hacks import Jinja2Templates, parse_accept_lang
+from fatcat_scholar.hacks import (
+    Jinja2Templates,
+    parse_accept_lang,
+    wayback_direct_url,
+    make_access_redirect_url,
+)
 from fatcat_scholar.search import (
     process_query,
     FulltextQuery,
@@ -177,42 +183,15 @@ def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict
     return doc
 
 
-def wayback_direct_url(url: str) -> str:
-    """
-    Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access)
-    """
-    if not "://web.archive.org" in url:
-        return url
-    segments = url.split("/")
-    if len(segments) < 6 or not segments[4].isdigit():
-        return url
-    segments[4] += "id_"
-    return "/".join(segments)
-
-
-def test_wayback_direct_url() -> None:
-    assert (
-        wayback_direct_url("http://fatcat.wiki/thing.pdf")
-        == "http://fatcat.wiki/thing.pdf"
-    )
-    assert (
-        wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf")
-        == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf"
-    )
-    assert (
-        wayback_direct_url(
-            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
-        )
-        == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf"
-    )
-
-
 @api.get(
     "/access-redirect/{sha1}.pdf",
     operation_id="access_redirect_pdf",
     include_in_schema=False,
 )
 def access_redirect_pdf(sha1: str = Query(..., min_length=40, max_length=40)) -> Any:
+    """
+    NOTE: DEPRECATED
+    """
     fulltext = lookup_fulltext_pdf(sha1)
     if not fulltext or not fulltext.access_url:
         raise HTTPException(status_code=404, detail="PDF file not found")
@@ -222,6 +201,28 @@ def access_redirect_pdf(sha1: str = Query(..., min_length=40, max_length=40)) ->
     return RedirectResponse(access_url, status_code=302)
 
 
+@api.get(
+    "/access/wayback/{timestamp}/{url:path}",
+    operation_id="access_redirect_wayback",
+    include_in_schema=False,
+)
+def access_redirect_wayback(timestamp: int, url: str, request: Request) -> Any:
+    original_url = "/".join(str(request.url).split("/")[6:])
+    access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
+    return RedirectResponse(access_url, status_code=302)
+
+
+@api.get(
+    "/access/ia_file/{item}/{file_path:path}",
+    operation_id="access_redirect_ia_file",
+    include_in_schema=False,
+)
+def access_redirect_ia_file(item: str, file_path: str, request: Request) -> Any:
+    original_path = urllib.parse.quote("/".join(str(request.url).split("/")[6:]))
+    access_url = f"https://archive.org/download/{item}/{original_path}"
+    return RedirectResponse(access_url, status_code=302)
+
+
 web = APIRouter()
@@ -270,6 +271,7 @@ def load_i18n_templates() -> Any:
         # pass-through application settings to be available in templates
         templates.env.globals["settings"] = settings
         templates.env.globals["babel_numbers"] = babel.numbers
+        templates.env.globals["make_access_redirect_url"] = make_access_redirect_url
         d[lang_opt] = templates
 
     return d
@@ -414,6 +416,7 @@ async def favicon() -> Any:
         "fatcat_scholar/static/ia-favicon.ico", media_type="image/x-icon"
     )
 
+
 @app.get("/sitemap.xml", include_in_schema=False)
 async def basic_sitemap() -> Any:
     return FileResponse(
diff --git a/tests/test_web.py b/tests/test_web.py
index 6c6632d..fc7ea14 100644
--- a/tests/test_web.py
+++ b/tests/test_web.py
@@ -102,6 +102,7 @@ def test_basic_search(client: Any, mocker: Any) -> None:
     rv = client.get("/zh/search?q=blood")
     assert rv.status_code == 200
 
+
 def test_basic_work_landing_page(client: Any, mocker: Any) -> None:
 
     with open("tests/files/elastic_fulltext_get.json") as f:
@@ -122,7 +123,11 @@ def test_basic_work_landing_page(client: Any, mocker: Any) -> None:
     rv = client.get("/zh/work/2x5qvct2dnhrbctqa2q2uyut6a")
     assert rv.status_code == 200
 
+
 def test_basic_access_redirect(client: Any, mocker: Any) -> None:
+    """
+    NOTE: DEPRECATED
+    """
 
     with open("tests/files/elastic_fulltext_search.json") as f:
         elastic_resp = json.loads(f.read())
@@ -135,9 +140,42 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None:
         (200, {}, json.dumps(elastic_resp)),
     ]
 
-    rv = client.get("/access-redirect/f81f84e23c9ba5d364c70f01fa26e645d29c0427.pdf", allow_redirects=False)
+    rv = client.get(
+        "/access-redirect/f81f84e23c9ba5d364c70f01fa26e645d29c0427.pdf",
+        allow_redirects=False,
+    )
     assert rv.status_code == 302
-    assert rv.headers['Location'] == "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf"
+    assert (
+        rv.headers["Location"]
+        == "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf"
+    )
 
-    rv = client.get("/access-redirect/aaaaaaaaaaaaaaaaaaaaaa01fa26e645d29c0427.pdf", allow_redirects=False)
+    rv = client.get(
+        "/access-redirect/aaaaaaaaaaaaaaaaaaaaaa01fa26e645d29c0427.pdf",
+        allow_redirects=False,
+    )
     assert rv.status_code == 404
+
+
+def test_access_redirects(client: Any, mocker: Any) -> None:
+
+    # tricky "URL encoding in archive.org path" case
+    rv = client.get(
+        "/access/ia_file/crossref-pre-1909-scholarly-works/10.1016%252Fs0140-6736%252802%252912493-7.zip/10.1016%252Fs0140-6736%252802%252912928-x.pdf",
+        allow_redirects=False,
+    )
+    assert rv.status_code == 302
+    assert (
+        rv.headers["Location"]
+        == "https://archive.org/download/crossref-pre-1909-scholarly-works/10.1016%252Fs0140-6736%252802%252912493-7.zip/10.1016%252Fs0140-6736%252802%252912928-x.pdf"
+    )
+
+    rv = client.get(
+        "/access/wayback/20170814015956/https://epub.uni-regensburg.de/21901/1/lorenz73.pdf",
+        allow_redirects=False,
+    )
+    assert rv.status_code == 302
+    assert (
+        rv.headers["Location"]
+        == "https://web.archive.org/web/20170814015956id_/https://epub.uni-regensburg.de/21901/1/lorenz73.pdf"
+    )
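Beyond the TestClient checks above, a rough manual spot-check of the new wayback redirect route might look like the following sketch. It assumes a live deployment reachable at scholar.archive.org (an assumption, matching the host used in the generated redirect URLs); the expected `Location` value is taken from the test case above:

```python
# Rough manual check of the new /access/wayback/ endpoint against a live deployment
# (hypothetical host: scholar.archive.org; the tests use the in-process TestClient instead).
import requests

resp = requests.get(
    "https://scholar.archive.org/access/wayback/20170814015956/"
    "https://epub.uni-regensburg.de/21901/1/lorenz73.pdf",
    allow_redirects=False,
)
assert resp.status_code == 302
# Redirect target is the direct ("id_") wayback file URL, per the test expectation above
assert (
    resp.headers["Location"]
    == "https://web.archive.org/web/20170814015956id_/https://epub.uni-regensburg.de/21901/1/lorenz73.pdf"
)
```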