author      Bryan Newbold <bnewbold@archive.org>    2021-05-17 21:04:29 -0700
committer   Bryan Newbold <bnewbold@archive.org>    2021-05-17 21:04:29 -0700
commit      f767a344c1ed7722b79710c6f3c61d5802f78860 (patch)
tree        8a4779f4c83687965a80127316274967f14f4685
parent      f4ffc6863ec7d08a195cb8cb5370a153d093454e (diff)
download    fatcat-scholar-f767a344c1ed7722b79710c6f3c61d5802f78860.tar.gz
            fatcat-scholar-f767a344c1ed7722b79710c6f3c61d5802f78860.zip
iterate on PDF redirect links
-rw-r--r--   fatcat_scholar/hacks.py              | 69
-rw-r--r--   fatcat_scholar/search.py             |  2
-rw-r--r--   fatcat_scholar/templates/work.html   |  8
-rw-r--r--   fatcat_scholar/web.py                | 65
-rw-r--r--   tests/test_web.py                    | 44
5 files changed, 149 insertions(+), 39 deletions(-)
diff --git a/fatcat_scholar/hacks.py b/fatcat_scholar/hacks.py
index 5cb8572..1b53e01 100644
--- a/fatcat_scholar/hacks.py
+++ b/fatcat_scholar/hacks.py
@@ -88,3 +88,72 @@ def test_parse_accept_lang() -> None:
parse_accept_lang("en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7", ["zh", "en", "de"])
== "en"
)
+
+
+def wayback_direct_url(url: str) -> str:
+ """
+ Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access)
+ """
+ if not "://web.archive.org" in url:
+ return url
+ segments = url.split("/")
+ if len(segments) < 6 or not segments[4].isdigit():
+ return url
+ segments[4] += "id_"
+ return "/".join(segments)
+
+
+def test_wayback_direct_url() -> None:
+ assert (
+ wayback_direct_url("http://fatcat.wiki/thing.pdf")
+ == "http://fatcat.wiki/thing.pdf"
+ )
+ assert (
+ wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf")
+ == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf"
+ )
+ assert (
+ wayback_direct_url(
+ "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
+ )
+ == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf"
+ )
+
+
+def make_access_redirect_url(access_type: str, access_url: str) -> str:
+ if access_type == "wayback" and "://web.archive.org/" in access_url:
+ segments = access_url.split("/")
+ dt = segments[4]
+ original_url = "/".join(segments[5:])
+ return f"https://scholar.archive.org/access/wayback/{dt}/{original_url}"
+ elif access_type == "ia_file" and "://archive.org/download/" in access_url:
+ suffix = "/".join(access_url.split("/")[4:])
+ return f"https://scholar.archive.org/access/ia_file/{suffix}"
+ else:
+ return access_url
+
+
+def test_make_access_redirect_url() -> None:
+ assert (
+ make_access_redirect_url(
+ "wayback", "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
+ )
+ == "https://scholar.archive.org/access/wayback/1234/http://fatcat.wiki/thing.pdf"
+ )
+ assert (
+ make_access_redirect_url(
+ "wayback",
+ "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf?param=asdf",
+ )
+ == "https://scholar.archive.org/access/wayback/1234/http://fatcat.wiki/thing.pdf?param=asdf"
+ )
+ assert (
+ make_access_redirect_url(
+ "ia_file", "https://archive.org/download/something/file.pdf"
+ )
+ == "https://scholar.archive.org/access/ia_file/something/file.pdf"
+ )
+ assert (
+ make_access_redirect_url("blah", "https://mit.edu/file.pdf")
+ == "https://mit.edu/file.pdf"
+ )
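
For reference, a usage sketch of how the two helpers above fit together with the new /access/ endpoints added in web.py further down. This is not part of the commit, and the example.com URL is made up:

# Hypothetical usage sketch; assumes fatcat_scholar.hacks is on the import path.
from fatcat_scholar.hacks import make_access_redirect_url, wayback_direct_url

replay = "https://web.archive.org/web/20200206164725/https://example.com/paper.pdf"

# the work.html template now emits a stable scholar.archive.org redirect URL
redirect = make_access_redirect_url("wayback", replay)
# -> "https://scholar.archive.org/access/wayback/20200206164725/https://example.com/paper.pdf"

# the /access/wayback/ endpoint then 302s to the raw-file ("id_") replay URL,
# which is the same URL wayback_direct_url() computes from the replay link
direct = wayback_direct_url(replay)
# -> "https://web.archive.org/web/20200206164725id_/https://example.com/paper.pdf"
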
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 2c02a58..b9ede1d 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -463,7 +463,7 @@ def get_es_scholar_doc(key: str) -> Optional[dict]:
return doc
-def lookup_fulltext_pdf(sha1: str) -> Optional[dict]:
+def lookup_fulltext_pdf(sha1: str) -> Optional[ScholarFulltext]:
"""
Fetch a document by fulltext file sha1, returning only the 'fulltext' sub-document.
"""
diff --git a/fatcat_scholar/templates/work.html b/fatcat_scholar/templates/work.html
index 611576b..67c87e0 100644
--- a/fatcat_scholar/templates/work.html
+++ b/fatcat_scholar/templates/work.html
@@ -30,10 +30,10 @@
{% if work.biblio.doi %}
<meta name="citation_doi" content="{{ work.biblio.doi }}">
{% endif %}
-{% if work.fulltext.access_url and work.biblio.release_ident == work.fulltext.release_ident and work.fulltext.access_type in ['wayback', 'ia_file'] and work.fulltext.file_mimetype == "application/pdf" and work.fulltext.file_sha1 %}
- <!-- PDF access redirect URL, as requested by, eg, scholar.google.com -->
- <meta name="citation_pdf_url" content="https://scholar.archive.org/access-redirect/{{ work.fulltext.file_sha1 }}.pdf">
- <!-- Multiple URLs allowed? <meta name="citation_pdf_url" content="{{ work.fulltext.access_url }}"> -->
+{% if work.fulltext.access_url and work.biblio.release_ident == work.fulltext.release_ident and work.fulltext.access_type in ['wayback', 'ia_file'] and work.fulltext.file_mimetype in ["application/pdf", None] and work.fulltext.file_sha1 %}
+ <!-- single PDF access redirect URL -->
+ <meta name="citation_pdf_url" content="{{ make_access_redirect_url(work.fulltext.access_type, work.fulltext.access_url) }}">
+ <!-- direct URL: {{ work.fulltext.access_url | safe }} -->
{% endif %}
{% endblock %}
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index 3c8b8f3..f2ef331 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -5,6 +5,7 @@ So far there are few endpoints, so we just put them all here!
"""
import logging
+import urllib.parse
from typing import Optional, Any, List, Dict
from pydantic import BaseModel
@@ -25,7 +26,12 @@ from starlette_prometheus import metrics, PrometheusMiddleware
from starlette.exceptions import HTTPException as StarletteHTTPException
from fatcat_scholar.config import settings, GIT_REVISION
-from fatcat_scholar.hacks import Jinja2Templates, parse_accept_lang
+from fatcat_scholar.hacks import (
+ Jinja2Templates,
+ parse_accept_lang,
+ wayback_direct_url,
+ make_access_redirect_url,
+)
from fatcat_scholar.search import (
process_query,
FulltextQuery,
@@ -177,42 +183,15 @@ def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict
return doc
-def wayback_direct_url(url: str) -> str:
- """
- Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access)
- """
- if not "://web.archive.org" in url:
- return url
- segments = url.split("/")
- if len(segments) < 6 or not segments[4].isdigit():
- return url
- segments[4] += "id_"
- return "/".join(segments)
-
-
-def test_wayback_direct_url() -> None:
- assert (
- wayback_direct_url("http://fatcat.wiki/thing.pdf")
- == "http://fatcat.wiki/thing.pdf"
- )
- assert (
- wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf")
- == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf"
- )
- assert (
- wayback_direct_url(
- "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
- )
- == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf"
- )
-
-
@api.get(
"/access-redirect/{sha1}.pdf",
operation_id="access_redirect_pdf",
include_in_schema=False,
)
def access_redirect_pdf(sha1: str = Query(..., min_length=40, max_length=40)) -> Any:
+ """
+ NOTE: DEPRECATED
+ """
fulltext = lookup_fulltext_pdf(sha1)
if not fulltext or not fulltext.access_url:
raise HTTPException(status_code=404, detail="PDF file not found")
@@ -222,6 +201,28 @@ def access_redirect_pdf(sha1: str = Query(..., min_length=40, max_length=40)) ->
return RedirectResponse(access_url, status_code=302)
+@api.get(
+ "/access/wayback/{timestamp}/{url:path}",
+ operation_id="access_redirect_wayback",
+ include_in_schema=False,
+)
+def access_redirect_wayback(timestamp: int, url: str, request: Request) -> Any:
+ original_url = "/".join(str(request.url).split("/")[6:])
+ access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
+ return RedirectResponse(access_url, status_code=302)
+
+
+@api.get(
+ "/access/ia_file/{item}/{file_path:path}",
+ operation_id="access_redirect_ia_file",
+ include_in_schema=False,
+)
+def access_redirect_ia_file(item: str, file_path: str, request: Request) -> Any:
+ original_path = urllib.parse.quote("/".join(str(request.url).split("/")[6:]))
+ access_url = f"https://archive.org/download/{item}/{original_path}"
+ return RedirectResponse(access_url, status_code=302)
+
+
web = APIRouter()
@@ -270,6 +271,7 @@ def load_i18n_templates() -> Any:
# pass-through application settings to be available in templates
templates.env.globals["settings"] = settings
templates.env.globals["babel_numbers"] = babel.numbers
+ templates.env.globals["make_access_redirect_url"] = make_access_redirect_url
d[lang_opt] = templates
return d
@@ -414,6 +416,7 @@ async def favicon() -> Any:
"fatcat_scholar/static/ia-favicon.ico", media_type="image/x-icon"
)
+
@app.get("/sitemap.xml", include_in_schema=False)
async def basic_sitemap() -> Any:
return FileResponse(
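
The two new handlers above rebuild the trailing URL from str(request.url) rather than from the decoded {url:path} / {file_path:path} parameters, so query strings and percent-encoding pass through to the redirect untouched. A minimal standalone sketch of that pattern, assuming the route is mounted at the application root (hypothetical app, not part of this commit):

from typing import Any

from fastapi import FastAPI, Request
from fastapi.responses import RedirectResponse

app = FastAPI()

@app.get("/access/wayback/{timestamp}/{url:path}")
def wayback_redirect(timestamp: int, url: str, request: Request) -> Any:
    # str(request.url) preserves the query string and the original
    # percent-encoding, which the decoded `url` parameter would lose;
    # segment index 6 is where the captured URL starts
    # (scheme, "", host, "access", "wayback", timestamp, ...).
    original_url = "/".join(str(request.url).split("/")[6:])
    return RedirectResponse(
        f"https://web.archive.org/web/{timestamp}id_/{original_url}",
        status_code=302,
    )
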
diff --git a/tests/test_web.py b/tests/test_web.py
index 6c6632d..fc7ea14 100644
--- a/tests/test_web.py
+++ b/tests/test_web.py
@@ -102,6 +102,7 @@ def test_basic_search(client: Any, mocker: Any) -> None:
rv = client.get("/zh/search?q=blood")
assert rv.status_code == 200
+
def test_basic_work_landing_page(client: Any, mocker: Any) -> None:
with open("tests/files/elastic_fulltext_get.json") as f:
@@ -122,7 +123,11 @@ def test_basic_work_landing_page(client: Any, mocker: Any) -> None:
rv = client.get("/zh/work/2x5qvct2dnhrbctqa2q2uyut6a")
assert rv.status_code == 200
+
def test_basic_access_redirect(client: Any, mocker: Any) -> None:
+ """
+ NOTE: DEPRECATED
+ """
with open("tests/files/elastic_fulltext_search.json") as f:
elastic_resp = json.loads(f.read())
@@ -135,9 +140,42 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None:
(200, {}, json.dumps(elastic_resp)),
]
- rv = client.get("/access-redirect/f81f84e23c9ba5d364c70f01fa26e645d29c0427.pdf", allow_redirects=False)
+ rv = client.get(
+ "/access-redirect/f81f84e23c9ba5d364c70f01fa26e645d29c0427.pdf",
+ allow_redirects=False,
+ )
assert rv.status_code == 302
- assert rv.headers['Location'] == "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf"
+ assert (
+ rv.headers["Location"]
+ == "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf"
+ )
- rv = client.get("/access-redirect/aaaaaaaaaaaaaaaaaaaaaa01fa26e645d29c0427.pdf", allow_redirects=False)
+ rv = client.get(
+ "/access-redirect/aaaaaaaaaaaaaaaaaaaaaa01fa26e645d29c0427.pdf",
+ allow_redirects=False,
+ )
assert rv.status_code == 404
+
+
+def test_access_redirects(client: Any, mocker: Any) -> None:
+
+ # tricky "URL encoding in archive.org path" case
+ rv = client.get(
+ "/access/ia_file/crossref-pre-1909-scholarly-works/10.1016%252Fs0140-6736%252802%252912493-7.zip/10.1016%252Fs0140-6736%252802%252912928-x.pdf",
+ allow_redirects=False,
+ )
+ assert rv.status_code == 302
+ assert (
+ rv.headers["Location"]
+ == "https://archive.org/download/crossref-pre-1909-scholarly-works/10.1016%252Fs0140-6736%252802%252912493-7.zip/10.1016%252Fs0140-6736%252802%252912928-x.pdf"
+ )
+
+ rv = client.get(
+ "/access/wayback/20170814015956/https://epub.uni-regensburg.de/21901/1/lorenz73.pdf",
+ allow_redirects=False,
+ )
+ assert rv.status_code == 302
+ assert (
+ rv.headers["Location"]
+ == "https://web.archive.org/web/20170814015956id_/https://epub.uni-regensburg.de/21901/1/lorenz73.pdf"
+ )
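
The same behaviour can be spot-checked against a live deployment; a small sketch mirroring the TestClient assertions above, assuming the httpx package is installed and scholar.archive.org is running code that includes this commit:

# Hedged spot-check outside the test suite; not part of this commit.
import httpx

resp = httpx.get(
    "https://scholar.archive.org/access/wayback/20170814015956/"
    "https://epub.uni-regensburg.de/21901/1/lorenz73.pdf",
    follow_redirects=False,
)
assert resp.status_code == 302
assert resp.headers["location"].startswith(
    "https://web.archive.org/web/20170814015956id_/"
)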