about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- fatcat_scholar/templates/access_404.html 35
-rw-r--r-- fatcat_scholar/web.py 226
-rw-r--r-- tests/test_web.py 103
3 files changed, 297 insertions, 67 deletions
diff --git a/fatcat_scholar/templates/access_404.html b/fatcat_scholar/templates/access_404.html
new file mode 100644
index 0000000..d058186
--- /dev/null
+++ b/fatcat_scholar/templates/access_404.html
@@ -0,0 +1,35 @@
+{% extends "base.html" %}
+
+{% block title %}
+404 - {{ super() }}
+{% endblock %}
+
+{% block main %}
+<div class="ui icon error message">
+ <div class="content">
+ <div class="header">{% trans %}404: Access Location Not Found{% endtrans %}</div>
+ <p>{% trans %}We could not find a valid redirect for the URL you tried. Sorry about that!{% endtrans %}
+ <p>{% trans %}There may be a typo, truncation, or encoding error. Or, the resource may have been removed from our catalog.{% endtrans %}
+ <p>{% trans %}Some places you can visit to try to hunt down this resource (or a replacement) include:{% endtrans %}
+ <ul>
+ {% if original_url %}
+ <li>{% trans %}Original web url:{% endtrans %}
+ <br>
+ <code style="word-break: break-all;"><a href="{{ original_url }}">{{ original_url }}</a></code>
+ </li>
+ <li><a href="https://web.archive.org/web/*/{{ original_url }}">{% trans %}Wayback Machine calendar page (all captures){% endtrans %}</a>
+ {% endif %}
+ {% if archiveorg_path %}
+ <li>{% trans %}archive.org download link for the item:{% endtrans %}
+ {% set archiveorg_url="https://archive.org/download" + archiveorg_path %}
+ <br>
+ <code style="word-break: break-all;"><a href="{{ archiveorg_url }}">{{ archiveorg_url }}</a></code>
+ {% endif %}
+ {% if work_ident %}
+ <li><a href="/work/{{ work_ident }}">{% trans %}Scholar landing page{% endtrans %}</a>
+ <li><a href="https://fatcat.wiki/work/{{ work_ident }}">{% trans %}Fatcat catalog page{% endtrans %}</a>
+ {% endif %}
+ </ul>
+ </div>
+</div>
+{% endblock %}
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index b5af18e..a705e20 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -20,6 +20,7 @@ from fastapi.responses import (
RedirectResponse,
)
from fastapi.middleware.cors import CORSMiddleware
+import fatcat_openapi_client
import sentry_sdk
from sentry_sdk.integrations.asgi import SentryAsgiMiddleware
from starlette_prometheus import metrics, PrometheusMiddleware
@@ -182,72 +183,6 @@ def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict
return doc
-@api.get(
- "/work/{work_ident}/access/wayback/{url:path}",
- operation_id="access_redirect_wayback",
- include_in_schema=False,
-)
-def access_redirect_wayback(
- url: str,
- request: Request,
- work_ident: str = Query(..., min_length=20, max_length=20),
-) -> Any:
- raw_original_url = "/".join(str(request.url).split("/")[7:])
- # the quote() call is necessary because the URL is un-encoded in the path parameter
- # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d
- original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",)
- doc_dict = get_es_scholar_doc(f"work_{work_ident}")
- if not doc_dict:
- raise HTTPException(status_code=404, detail="work not found")
- doc: ScholarDoc = doc_dict["_obj"]
- # combine fulltext with all access options
- access: List[Any] = []
- if doc.fulltext:
- access.append(doc.fulltext)
- access.extend(doc.access or [])
- for opt in access:
- if (
- opt.access_type == "wayback"
- and opt.access_url
- and "://web.archive.org/web/" in opt.access_url
- and opt.access_url.endswith(original_url)
- ):
- timestamp = opt.access_url.split("/")[4]
- if not (len(timestamp) == 14 and timestamp.isdigit()):
- continue
- access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
- return RedirectResponse(access_url, status_code=302)
- raise HTTPException(status_code=404, detail="access URL not found")
-
-
-@api.get(
- "/work/{work_ident}/access/ia_file/{item}/{file_path:path}",
- operation_id="access_redirect_ia_file",
- include_in_schema=False,
-)
-def access_redirect_ia_file(
- item: str,
- file_path: str,
- request: Request,
- work_ident: str = Query(..., min_length=20, max_length=20),
-) -> Any:
- original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:]))
- access_url = f"https://archive.org/download/{item}/{original_path}"
- doc_dict = get_es_scholar_doc(f"work_{work_ident}")
- if not doc_dict:
- raise HTTPException(status_code=404, detail="work not found")
- doc: ScholarDoc = doc_dict["_obj"]
- # combine fulltext with all access options
- access: List[Any] = []
- if doc.fulltext:
- access.append(doc.fulltext)
- access.extend(doc.access or [])
- for opt in access:
- if opt.access_type == "ia_file" and opt.access_url == access_url:
- return RedirectResponse(access_url, status_code=302)
- raise HTTPException(status_code=404, detail="access URL not found")
-
-
web = APIRouter()
@@ -413,6 +348,165 @@ def web_work(
)
+def access_redirect_fallback(
+ request: Request,
+ work_ident: str,
+ original_url: Optional[str] = None,
+ archiveorg_path: Optional[str] = None,
+) -> Any:
+ """
+ The purpose of this helper is to catch access redirects which would
+ otherwise return a 404, and "try harder" to find a redirect.
+ """
+ # lookup against the live fatcat API, instead of scholar ES index
+ api_conf = fatcat_openapi_client.Configuration()
+ api_conf.host = settings.FATCAT_API_HOST
+ api_client = fatcat_openapi_client.DefaultApi(
+ fatcat_openapi_client.ApiClient(api_conf)
+ )
+
+ # fetch list of releases for this work from current fatcat catalog. note
+ # that these releases are not expanded (don't include file entities)
+ try:
+ # fetch work entity itself to fail fast (true 404) and handle redirects
+ work_entity = api_client.get_work(work_ident)
+ logger.warning(
+ f"access_redirect_fallback: work_{work_ident} state={work_entity.state} redirect={work_entity.redirect}"
+ )
+ if work_entity.redirect:
+ work_ident = work_entity.redirect
+ partial_releases = api_client.get_work_releases(
+ ident=work_ident, hide="abstracts,references",
+ )
+ except fatcat_openapi_client.ApiException as ae:
+ raise HTTPException(
+ status_code=ae.status,
+ detail=f"Fatcat API call failed for work_{work_ident}",
+ )
+
+ # for each release, check for any archive.org access option with the given context
+ for partial in partial_releases:
+ release = api_client.get_release(
+ partial.ident,
+ expand="files",
+ # TODO: expand="files,filesets,webcaptures",
+ hide="abstracts,references",
+ )
+ if not release.files:
+ continue
+ for fe in release.files:
+ for url_pair in fe.urls:
+ access_url = url_pair.url
+ if (
+ original_url
+ and "://web.archive.org/web/" in access_url
+ and access_url.endswith(original_url)
+ ):
+ # TODO: test/verify this
+ timestamp = access_url.split("/")[4]
+ # if not (len(timestamp) == 14 and timestamp.isdigit()):
+ # continue
+ replay_url = (
+ f"https://web.archive.org/web/{timestamp}id_/{original_url}"
+ )
+ return RedirectResponse(replay_url, status_code=302)
+ elif (
+ archiveorg_path
+ and "://archive.org/" in access_url
+ and archiveorg_path in access_url
+ ):
+ return RedirectResponse(access_url, status_code=302)
+
+ # give up and show an error page
+ lang = LangPrefix(request)
+ return i18n_templates[lang.code].TemplateResponse(
+ "access_404.html",
+ {
+ "request": request,
+ "locale": lang.code,
+ "lang_prefix": lang.prefix,
+ "work_ident": work_ident,
+ "original_url": original_url,
+ "archiveorg_path": archiveorg_path,
+ },
+ status_code=404,
+ )
+
+
+@web.get(
+ "/work/{work_ident}/access/wayback/{url:path}",
+ operation_id="access_redirect_wayback",
+ include_in_schema=False,
+)
+def access_redirect_wayback(
+ url: str,
+ request: Request,
+ work_ident: str = Query(..., min_length=20, max_length=20),
+) -> Any:
+ raw_original_url = "/".join(str(request.url).split("/")[7:])
+ # the quote() call is necessary because the URL is un-encoded in the path parameter
+ # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d
+ original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",)
+ doc_dict = get_es_scholar_doc(f"work_{work_ident}")
+ if not doc_dict:
+ return access_redirect_fallback(
+ request, work_ident=work_ident, original_url=original_url
+ )
+ doc: ScholarDoc = doc_dict["_obj"]
+ # combine fulltext with all access options
+ access: List[Any] = []
+ if doc.fulltext:
+ access.append(doc.fulltext)
+ access.extend(doc.access or [])
+ for opt in access:
+ if (
+ opt.access_type == "wayback"
+ and opt.access_url
+ and "://web.archive.org/web/" in opt.access_url
+ and opt.access_url.endswith(original_url)
+ ):
+ timestamp = opt.access_url.split("/")[4]
+ if not (len(timestamp) == 14 and timestamp.isdigit()):
+ continue
+ access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
+ return RedirectResponse(access_url, status_code=302)
+ return access_redirect_fallback(
+ request, work_ident=work_ident, original_url=original_url
+ )
+
+
+@web.get(
+ "/work/{work_ident}/access/ia_file/{item}/{file_path:path}",
+ operation_id="access_redirect_ia_file",
+ include_in_schema=False,
+)
+def access_redirect_ia_file(
+ item: str,
+ file_path: str,
+ request: Request,
+ work_ident: str = Query(..., min_length=20, max_length=20),
+) -> Any:
+ original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:]))
+ access_url = f"https://archive.org/download/{item}/{original_path}"
+ doc_dict = get_es_scholar_doc(f"work_{work_ident}")
+ if not doc_dict:
+ return access_redirect_fallback(
+ request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}"
+ )
+ doc: ScholarDoc = doc_dict["_obj"]
+ # combine fulltext with all access options
+ access: List[Any] = []
+ if doc.fulltext:
+ access.append(doc.fulltext)
+ access.extend(doc.access or [])
+ for opt in access:
+ if opt.access_type == "ia_file" and opt.access_url == access_url:
+ return RedirectResponse(access_url, status_code=302)
+ return access_redirect_fallback(
+ request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}"
+ )
+
+
app = FastAPI(
title="Fatcat Scholar",
description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.",
diff --git a/tests/test_web.py b/tests/test_web.py
index 7f1f72a..d9cfab6 100644
--- a/tests/test_web.py
+++ b/tests/test_web.py
@@ -3,6 +3,7 @@ from typing import Any
import pytest
from fastapi.testclient import TestClient
+import fatcat_openapi_client
from fatcat_scholar.web import app
@@ -148,7 +149,11 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None:
== "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf"
)
- # check that URL is validated
+ # check that URL is validated (force fatcat API fallback to fail)
+ fatcat_api_raw = mocker.patch("fatcat_openapi_client.ApiClient.call_api")
+ fatcat_api_raw.side_effect = [
+ fatcat_openapi_client.ApiException(status=404, reason="dummy")
+ ]
rv = client.get(
"/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY",
allow_redirects=False,
@@ -156,6 +161,102 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None:
assert rv.status_code == 404
+def test_access_redirect_fallback(client: Any, mocker: Any) -> None:
+
+ with open("tests/files/elastic_fulltext_get.json") as f:
+ elastic_resp = json.loads(f.read())
+
+ es_raw = mocker.patch(
+ "elasticsearch.connection.Urllib3HttpConnection.perform_request"
+ )
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp)),
+ (200, {}, json.dumps(elastic_resp)),
+ (200, {}, json.dumps(elastic_resp)),
+ (200, {}, json.dumps(elastic_resp)),
+ ]
+ fatcat_get_work_raw = mocker.patch("fatcat_openapi_client.DefaultApi.get_work")
+ fatcat_get_work_raw.side_effect = [
+ fatcat_openapi_client.WorkEntity(
+ state="active", ident="wwwwwwwwwwwwwwwwwwwwwwwwww",
+ )
+ ] * 4
+ fatcat_get_work_releases_raw = mocker.patch(
+ "fatcat_openapi_client.DefaultApi.get_work_releases"
+ )
+ fatcat_get_work_releases_raw.side_effect = [
+ [
+ fatcat_openapi_client.ReleaseEntity(
+ ident="rrrrrrrrrrrrrrrrrrrrrrrrrr",
+ ext_ids=fatcat_openapi_client.ReleaseExtIds(),
+ ),
+ ]
+ ] * 4
+ fatcat_get_release_raw = mocker.patch(
+ "fatcat_openapi_client.DefaultApi.get_release"
+ )
+ fatcat_get_release_raw.side_effect = [
+ fatcat_openapi_client.ReleaseEntity(
+ state="active",
+ ident="rrrrrrrrrrrrrrrrrrrrrrrrrr",
+ ext_ids=fatcat_openapi_client.ReleaseExtIds(),
+ files=[
+ fatcat_openapi_client.FileEntity(
+ ident="ffffffffffffffffffffffffff",
+ urls=[
+ fatcat_openapi_client.FileUrl(
+ rel="web", url="https://blarg.example.com",
+ ),
+ fatcat_openapi_client.FileUrl(
+ rel="webarchive",
+ url="https://web.archive.org/web/12345/https://example.com",
+ ),
+ fatcat_openapi_client.FileUrl(
+ rel="archive",
+ url="https://archive.org/download/some/thing.pdf",
+ ),
+ ],
+ ),
+ ],
+ )
+ ] * 4
+
+ # redirects should work after API lookup, for both wayback and archive.org
+ rv = client.get(
+ "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://example.com",
+ allow_redirects=False,
+ )
+ assert rv.status_code == 302
+ assert (
+ rv.headers["Location"]
+ == "https://web.archive.org/web/12345id_/https://example.com"
+ )
+
+ rv = client.get(
+ "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.pdf",
+ allow_redirects=False,
+ )
+ assert rv.status_code == 302
+ assert rv.headers["Location"] == "https://archive.org/download/some/thing.pdf"
+
+ # wrong URLs should still not work, but display a page with helpful links
+ rv = client.get(
+ "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY",
+ allow_redirects=False,
+ )
+ assert rv.status_code == 404
+ assert b"Access Location Not Found" in rv.content
+ assert b"web.archive.org/web/*/https://www.federalreserve.gov" in rv.content
+
+ rv = client.get(
+ "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.else.pdf",
+ allow_redirects=False,
+ )
+ assert rv.status_code == 404
+ assert b"Access Location Not Found" in rv.content
+ assert b"archive.org/download/some/thing.else.pdf" in rv.content
+
+
def test_access_redirect_encoding(client: Any, mocker: Any) -> None:
with open("tests/files/elastic_get_work_a6gvpil4brdgzhqyaog3ftngqe.json") as f: