| author | Bryan Newbold <bnewbold@archive.org> | 2021-05-17 21:04:29 -0700 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-05-17 21:04:29 -0700 |
| commit | f767a344c1ed7722b79710c6f3c61d5802f78860 (patch) | |
| tree | 8a4779f4c83687965a80127316274967f14f4685 | |
| parent | f4ffc6863ec7d08a195cb8cb5370a153d093454e (diff) | |
| download | fatcat-scholar-f767a344c1ed7722b79710c6f3c61d5802f78860.tar.gz, fatcat-scholar-f767a344c1ed7722b79710c6f3c61d5802f78860.zip | |
iterate on PDF redirect links
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | fatcat_scholar/hacks.py | 69 |
| -rw-r--r-- | fatcat_scholar/search.py | 2 |
| -rw-r--r-- | fatcat_scholar/templates/work.html | 8 |
| -rw-r--r-- | fatcat_scholar/web.py | 65 |
| -rw-r--r-- | tests/test_web.py | 44 |

5 files changed, 149 insertions(+), 39 deletions(-)
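This commit moves the `citation_pdf_url` logic away from the sha1-based `/access-redirect/` lookup (now marked deprecated) to explicit `/access/wayback/` and `/access/ia_file/` redirect routes, backed by two helpers added to `fatcat_scholar/hacks.py`. As orientation before the diff, a minimal usage sketch of those helpers; inputs and expected outputs are copied from the unit tests included in this commit:

```python
# Sketch only: behaviour mirrors the unit tests added in fatcat_scholar/hacks.py below.
from fatcat_scholar.hacks import make_access_redirect_url, wayback_direct_url

# wayback capture URL -> stable scholar.archive.org redirect (used as citation_pdf_url)
assert (
    make_access_redirect_url(
        "wayback", "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
    )
    == "https://scholar.archive.org/access/wayback/1234/http://fatcat.wiki/thing.pdf"
)

# archive.org file download URL -> /access/ia_file/ redirect
assert (
    make_access_redirect_url("ia_file", "https://archive.org/download/something/file.pdf")
    == "https://scholar.archive.org/access/ia_file/something/file.pdf"
)

# wayback replay URL -> direct file access ("id_") URL, which the redirect endpoints resolve to
assert (
    wayback_direct_url("https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf")
    == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf"
)
```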
diff --git a/fatcat_scholar/hacks.py b/fatcat_scholar/hacks.py
index 5cb8572..1b53e01 100644
--- a/fatcat_scholar/hacks.py
+++ b/fatcat_scholar/hacks.py
@@ -88,3 +88,72 @@ def test_parse_accept_lang() -> None:
         parse_accept_lang("en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", ["zh", "en", "de"])
         == "en"
     )
+
+
+def wayback_direct_url(url: str) -> str:
+    """
+    Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access)
+    """
+    if not "://web.archive.org" in url:
+        return url
+    segments = url.split("/")
+    if len(segments) < 6 or not segments[4].isdigit():
+        return url
+    segments[4] += "id_"
+    return "/".join(segments)
+
+
+def test_wayback_direct_url() -> None:
+    assert (
+        wayback_direct_url("http://fatcat.wiki/thing.pdf")
+        == "http://fatcat.wiki/thing.pdf"
+    )
+    assert (
+        wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf")
+        == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf"
+    )
+    assert (
+        wayback_direct_url(
+            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
+        )
+        == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf"
+    )
+
+
+def make_access_redirect_url(access_type: str, access_url: str) -> str:
+    if access_type == "wayback" and "://web.archive.org/" in access_url:
+        segments = access_url.split("/")
+        dt = segments[4]
+        original_url = "/".join(segments[5:])
+        return f"https://scholar.archive.org/access/wayback/{dt}/{original_url}"
+    elif access_type == "ia_file" and "://archive.org/download/" in access_url:
+        suffix = "/".join(access_url.split("/")[4:])
+        return f"https://scholar.archive.org/access/ia_file/{suffix}"
+    else:
+        return access_url
+
+
+def test_make_access_redirect_url() -> None:
+    assert (
+        make_access_redirect_url(
+            "wayback", "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
+        )
+        == "https://scholar.archive.org/access/wayback/1234/http://fatcat.wiki/thing.pdf"
+    )
+    assert (
+        make_access_redirect_url(
+            "wayback",
+            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf?param=asdf",
+        )
+        == "https://scholar.archive.org/access/wayback/1234/http://fatcat.wiki/thing.pdf?param=asdf"
+    )
+    assert (
+        make_access_redirect_url(
+            "ia_file", "https://archive.org/download/something/file.pdf"
+        )
+        == "https://scholar.archive.org/access/ia_file/something/file.pdf"
+    )
+    assert (
+        make_access_redirect_url("blah", "https://mit.edu/file.pdf")
+        == "https://mit.edu/file.pdf"
+    )
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 2c02a58..b9ede1d 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -463,7 +463,7 @@ def get_es_scholar_doc(key: str) -> Optional[dict]:
     return doc
 
 
-def lookup_fulltext_pdf(sha1: str) -> Optional[dict]:
+def lookup_fulltext_pdf(sha1: str) -> Optional[ScholarFulltext]:
     """
     Fetch a document by fulltext file sha1, returning only the 'fulltext' sub-document.
     """
diff --git a/fatcat_scholar/templates/work.html b/fatcat_scholar/templates/work.html
index 611576b..67c87e0 100644
--- a/fatcat_scholar/templates/work.html
+++ b/fatcat_scholar/templates/work.html
@@ -30,10 +30,10 @@
 {% if work.biblio.doi %}
   <meta name="citation_doi" content="{{ work.biblio.doi }}">
 {% endif %}
-{% if work.fulltext.access_url and work.biblio.release_ident == work.fulltext.release_ident and work.fulltext.access_type in ['wayback', 'ia_file'] and work.fulltext.file_mimetype == "application/pdf" and work.fulltext.file_sha1 %}
-  <!-- PDF access redirect URL, as requested by, eg, scholar.google.com -->
-  <meta name="citation_pdf_url" content="https://scholar.archive.org/access-redirect/{{ work.fulltext.file_sha1 }}.pdf">
-  <!-- Multiple URLs allowed? <meta name="citation_pdf_url" content="{{ work.fulltext.access_url }}"> -->
+{% if work.fulltext.access_url and work.biblio.release_ident == work.fulltext.release_ident and work.fulltext.access_type in ['wayback', 'ia_file'] and work.fulltext.file_mimetype in ["application/pdf", None] and work.fulltext.file_sha1 %}
+  <!-- single PDF access redirect URL -->
+  <meta name="citation_pdf_url" content="{{ make_access_redirect_url(work.fulltext.access_type, work.fulltext.access_url) }}">
+  <!-- direct URL: {{ work.fulltext.access_url | safe }} -->
 {% endif %}
 {% endblock %}
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index 3c8b8f3..f2ef331 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -5,6 +5,7 @@ So far there are few endpoints, so we just put them all here!
 """
 
 import logging
+import urllib.parse
 from typing import Optional, Any, List, Dict
 
 from pydantic import BaseModel
@@ -25,7 +26,12 @@ from starlette_prometheus import metrics, PrometheusMiddleware
 from starlette.exceptions import HTTPException as StarletteHTTPException
 
 from fatcat_scholar.config import settings, GIT_REVISION
-from fatcat_scholar.hacks import Jinja2Templates, parse_accept_lang
+from fatcat_scholar.hacks import (
+    Jinja2Templates,
+    parse_accept_lang,
+    wayback_direct_url,
+    make_access_redirect_url,
+)
 from fatcat_scholar.search import (
     process_query,
     FulltextQuery,
@@ -177,42 +183,15 @@ def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict
     return doc
 
 
-def wayback_direct_url(url: str) -> str:
-    """
-    Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access)
-    """
-    if not "://web.archive.org" in url:
-        return url
-    segments = url.split("/")
-    if len(segments) < 6 or not segments[4].isdigit():
-        return url
-    segments[4] += "id_"
-    return "/".join(segments)
-
-
-def test_wayback_direct_url() -> None:
-    assert (
-        wayback_direct_url("http://fatcat.wiki/thing.pdf")
-        == "http://fatcat.wiki/thing.pdf"
-    )
-    assert (
-        wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf")
-        == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf"
-    )
-    assert (
-        wayback_direct_url(
-            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
-        )
-        == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf"
-    )
-
-
 @api.get(
     "/access-redirect/{sha1}.pdf",
     operation_id="access_redirect_pdf",
     include_in_schema=False,
 )
 def access_redirect_pdf(sha1: str = Query(..., min_length=40, max_length=40)) -> Any:
+    """
+    NOTE: DEPRECATED
+    """
     fulltext = lookup_fulltext_pdf(sha1)
     if not fulltext or not fulltext.access_url:
         raise HTTPException(status_code=404, detail="PDF file not found")
@@ -222,6 +201,28 @@ def access_redirect_pdf(sha1: str = Query(..., min_length=40, max_length=40)) ->
     return RedirectResponse(access_url, status_code=302)
 
 
+@api.get(
+    "/access/wayback/{timestamp}/{url:path}",
+    operation_id="access_redirect_wayback",
+    include_in_schema=False,
+)
+def access_redirect_wayback(timestamp: int, url: str, request: Request) -> Any:
+    original_url = "/".join(str(request.url).split("/")[6:])
+    access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
+    return RedirectResponse(access_url, status_code=302)
+
+
+@api.get(
+    "/access/ia_file/{item}/{file_path:path}",
+    operation_id="access_redirect_ia_file",
+    include_in_schema=False,
+)
+def access_redirect_ia_file(item: str, file_path: str, request: Request) -> Any:
+    original_path = urllib.parse.quote("/".join(str(request.url).split("/")[6:]))
+    access_url = f"https://archive.org/download/{item}/{original_path}"
+    return RedirectResponse(access_url, status_code=302)
+
+
 web = APIRouter()
@@ -270,6 +271,7 @@ def load_i18n_templates() -> Any:
         # pass-through application settings to be available in templates
         templates.env.globals["settings"] = settings
         templates.env.globals["babel_numbers"] = babel.numbers
+        templates.env.globals["make_access_redirect_url"] = make_access_redirect_url
         d[lang_opt] = templates
 
     return d
@@ -414,6 +416,7 @@ async def favicon() -> Any:
         "fatcat_scholar/static/ia-favicon.ico", media_type="image/x-icon"
     )
 
+
 @app.get("/sitemap.xml", include_in_schema=False)
 async def basic_sitemap() -> Any:
     return FileResponse(
diff --git a/tests/test_web.py b/tests/test_web.py
index 6c6632d..fc7ea14 100644
--- a/tests/test_web.py
+++ b/tests/test_web.py
@@ -102,6 +102,7 @@ def test_basic_search(client: Any, mocker: Any) -> None:
     rv = client.get("/zh/search?q=blood")
     assert rv.status_code == 200
 
+
 def test_basic_work_landing_page(client: Any, mocker: Any) -> None:
 
     with open("tests/files/elastic_fulltext_get.json") as f:
@@ -122,7 +123,11 @@ def test_basic_work_landing_page(client: Any, mocker: Any) -> None:
     rv = client.get("/zh/work/2x5qvct2dnhrbctqa2q2uyut6a")
     assert rv.status_code == 200
 
+
 def test_basic_access_redirect(client: Any, mocker: Any) -> None:
+    """
+    NOTE: DEPRECATED
+    """
 
     with open("tests/files/elastic_fulltext_search.json") as f:
         elastic_resp = json.loads(f.read())
@@ -135,9 +140,42 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None:
         (200, {}, json.dumps(elastic_resp)),
     ]
 
-    rv = client.get("/access-redirect/f81f84e23c9ba5d364c70f01fa26e645d29c0427.pdf", allow_redirects=False)
+    rv = client.get(
+        "/access-redirect/f81f84e23c9ba5d364c70f01fa26e645d29c0427.pdf",
+        allow_redirects=False,
+    )
     assert rv.status_code == 302
-    assert rv.headers['Location'] == "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf"
+    assert (
+        rv.headers["Location"]
+        == "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf"
+    )
 
-    rv = client.get("/access-redirect/aaaaaaaaaaaaaaaaaaaaaa01fa26e645d29c0427.pdf", allow_redirects=False)
+    rv = client.get(
+        "/access-redirect/aaaaaaaaaaaaaaaaaaaaaa01fa26e645d29c0427.pdf",
+        allow_redirects=False,
+    )
     assert rv.status_code == 404
+
+
+def test_access_redirects(client: Any, mocker: Any) -> None:
+
+    # tricky "URL encoding in archive.org path" case
+    rv = client.get(
+        "/access/ia_file/crossref-pre-1909-scholarly-works/10.1016%252Fs0140-6736%252802%252912493-7.zip/10.1016%252Fs0140-6736%252802%252912928-x.pdf",
+        allow_redirects=False,
+    )
+    assert rv.status_code == 302
+    assert (
+        rv.headers["Location"]
+        == "https://archive.org/download/crossref-pre-1909-scholarly-works/10.1016%252Fs0140-6736%252802%252912493-7.zip/10.1016%252Fs0140-6736%252802%252912928-x.pdf"
+    )
+
+    rv = client.get(
+        "/access/wayback/20170814015956/https://epub.uni-regensburg.de/21901/1/lorenz73.pdf",
+        allow_redirects=False,
+    )
+    assert rv.status_code == 302
+    assert (
+        rv.headers["Location"]
+        == "https://web.archive.org/web/20170814015956id_/https://epub.uni-regensburg.de/21901/1/lorenz73.pdf"
+    )
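Beyond the TestClient checks above, a rough manual spot-check of the new wayback redirect route might look like the following sketch. It assumes a live deployment reachable at scholar.archive.org (an assumption, matching the host used in the generated redirect URLs); the expected `Location` value is taken from the test case above:

```python
# Rough manual check of the new /access/wayback/ endpoint against a live deployment
# (hypothetical host: scholar.archive.org; the tests use the in-process TestClient instead).
import requests

resp = requests.get(
    "https://scholar.archive.org/access/wayback/20170814015956/"
    "https://epub.uni-regensburg.de/21901/1/lorenz73.pdf",
    allow_redirects=False,
)
assert resp.status_code == 302
# Redirect target is the direct ("id_") wayback file URL, per the test expectation above
assert (
    resp.headers["Location"]
    == "https://web.archive.org/web/20170814015956id_/https://epub.uni-regensburg.de/21901/1/lorenz73.pdf"
)
```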