web: access_redirect_fallback mechanism

This adds a helper code path that "tries harder" to find an access link by querying the fatcat API directly for any file from any release associated with the work. If it finds a match, it does the redirect as usual; each fallback attempt is logged as a warning once the work entity has been fetched. If no match can be found, there is now a more helpful access-specific 404 error page. If the *work* itself is a 404, the generic error page is shown.
author: Bryan Newbold <bnewbold@archive.org> 2021-07-26 20:52:56 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-07-26 20:52:58 -0700
commit: 7ebcdfebdf4e1c8b69026a24cafe500c67dbc384 (patch)
tree: ab580d4307acc5d10b5a8669d28ac6a6ec6e81e5
parent: eeb456c16d016d8523023f787597efae7a6317b9 (diff)
download: fatcat-scholar-7ebcdfebdf4e1c8b69026a24cafe500c67dbc384.tar.gz
fatcat-scholar-7ebcdfebdf4e1c8b69026a24cafe500c67dbc384.zip
3 files changed, 297 insertions, 67 deletions
diff --git a/fatcat_scholar/templates/access_404.html b/fatcat_scholar/templates/access_404.html
new file mode 100644
index 0000000..d058186
--- /dev/null
+++ b/fatcat_scholar/templates/access_404.html
@@ -0,0 +1,35 @@
+{% extends "base.html" %}
+
+{% block title %}
+404 - {{ super() }}
+{% endblock %}
+
+{% block main %}
+<div class="ui icon error message">
+  <div class="content">
+    <div class="header">{% trans %}404: Access Location Not Found{% endtrans %}</div>
+    <p>{% trans %}We could not find a valid redirect for the URL you tried. Sorry about that!{% endtrans %}
+    <p>{% trans %}There may be a typo, truncation, or encoding error. Or, the resource may have been removed from our catalog.{% endtrans %}
+    <p>{% trans %}Some places you can visit to try to hunt down this resource (or a replacement) include:{% endtrans %}
+    <ul>
+      {% if original_url %}
+        <li>{% trans %}Original web url:{% endtrans %}
+          <br>
+          <code style="word-break: break-all;"><a href="{{ original_url }}">{{ original_url }}</a></code>
+        </li>
+        <li><a href="https://web.archive.org/web/*/{{ original_url }}">{% trans %}Wayback Machine calendar page (all captures){% endtrans %}</a>
+      {% endif %}
+      {% if archiveorg_path %}
+        <li>{% trans %}archive.org download link for the item:{% endtrans %}
+          {% set archiveorg_url="https://archive.org/download" + archiveorg_path %}
+          <br>
+          <code style="word-break: break-all;"><a href="{{ archiveorg_url }}">{{ archiveorg_url }}</a></code>
+      {% endif %}
+      {% if work_ident %}
+        <li><a href="/work/{{ work_ident }}">{% trans %}Scholar landing page{% endtrans %}</a>
+        <li><a href="https://fatcat.wiki/work/{{ work_ident }}">{% trans %}Fatcat catalog page{% endtrans %}</a>
+      {% endif %}
+    </ul>
+  </div>
+</div>
+{% endblock %}
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index b5af18e..a705e20 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -20,6 +20,7 @@ from fastapi.responses import (
     RedirectResponse,
 )
 from fastapi.middleware.cors import CORSMiddleware
+import fatcat_openapi_client
 import sentry_sdk
 from sentry_sdk.integrations.asgi import SentryAsgiMiddleware
 from starlette_prometheus import metrics, PrometheusMiddleware
@@ -182,72 +183,6 @@ def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict
     return doc
 
 
-@api.get(
-    "/work/{work_ident}/access/wayback/{url:path}",
-    operation_id="access_redirect_wayback",
-    include_in_schema=False,
-)
-def access_redirect_wayback(
-    url: str,
-    request: Request,
-    work_ident: str = Query(..., min_length=20, max_length=20),
-) -> Any:
-    raw_original_url = "/".join(str(request.url).split("/")[7:])
-    # the quote() call is necessary because the URL is un-encoded in the path parameter
-    # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d
-    original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",)
-    doc_dict = get_es_scholar_doc(f"work_{work_ident}")
-    if not doc_dict:
-        raise HTTPException(status_code=404, detail="work not found")
-    doc: ScholarDoc = doc_dict["_obj"]
-    # combine fulltext with all access options
-    access: List[Any] = []
-    if doc.fulltext:
-        access.append(doc.fulltext)
-    access.extend(doc.access or [])
-    for opt in access:
-        if (
-            opt.access_type == "wayback"
-            and opt.access_url
-            and "://web.archive.org/web/" in opt.access_url
-            and opt.access_url.endswith(original_url)
-        ):
-            timestamp = opt.access_url.split("/")[4]
-            if not (len(timestamp) == 14 and timestamp.isdigit()):
-                continue
-            access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
-            return RedirectResponse(access_url, status_code=302)
-    raise HTTPException(status_code=404, detail="access URL not found")
-
-
-@api.get(
-    "/work/{work_ident}/access/ia_file/{item}/{file_path:path}",
-    operation_id="access_redirect_ia_file",
-    include_in_schema=False,
-)
-def access_redirect_ia_file(
-    item: str,
-    file_path: str,
-    request: Request,
-    work_ident: str = Query(..., min_length=20, max_length=20),
-) -> Any:
-    original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:]))
-    access_url = f"https://archive.org/download/{item}/{original_path}"
-    doc_dict = get_es_scholar_doc(f"work_{work_ident}")
-    if not doc_dict:
-        raise HTTPException(status_code=404, detail="work not found")
-    doc: ScholarDoc = doc_dict["_obj"]
-    # combine fulltext with all access options
-    access: List[Any] = []
-    if doc.fulltext:
-        access.append(doc.fulltext)
-    access.extend(doc.access or [])
-    for opt in access:
-        if opt.access_type == "ia_file" and opt.access_url == access_url:
-            return RedirectResponse(access_url, status_code=302)
-    raise HTTPException(status_code=404, detail="access URL not found")
-
-
 web = APIRouter()
 
 
@@ -413,6 +348,165 @@ def web_work(
     )
 
 
+def access_redirect_fallback(
+    request: Request,
+    work_ident: str,
+    original_url: Optional[str] = None,
+    archiveorg_path: Optional[str] = None,
+) -> Any:
+    """
+    The purpose of this helper is to catch access redirects which would
+    otherwise return a 404, and "try harder" to find a redirect.
+    """
+    # lookup against the live fatcat API, instead of scholar ES index
+    api_conf = fatcat_openapi_client.Configuration()
+    api_conf.host = settings.FATCAT_API_HOST
+    api_client = fatcat_openapi_client.DefaultApi(
+        fatcat_openapi_client.ApiClient(api_conf)
+    )
+
+    # fetch list of releases for this work from current fatcat catalog. note
+    # that these releases are not expanded (don't include file entities)
+    try:
+        # fetch work entity itself to fail fast (true 404) and handle redirects
+        work_entity = api_client.get_work(work_ident)
+        logger.warning(
+            f"access_redirect_fallback: work_{work_ident} state={work_entity.state} redirect={work_entity.redirect}"
+        )
+        if work_entity.redirect:
+            work_ident = work_entity.redirect
+        partial_releases = api_client.get_work_releases(
+            ident=work_ident, hide="abstracts,references",
+        )
+    except fatcat_openapi_client.ApiException as ae:
+        raise HTTPException(
+            status_code=ae.status,
+            detail=f"Fatcat API call failed for work_{work_ident}",
+        )
+
+    # for each release, check for any archive.org access option with the given context
+    for partial in partial_releases:
+        release = api_client.get_release(
+            partial.ident,
+            expand="files",
+            # TODO: expand="files,filesets,webcaptures",
+            hide="abstracts,references",
+        )
+        if not release.files:
+            continue
+        for fe in release.files:
+            for url_pair in fe.urls:
+                access_url = url_pair.url
+                if (
+                    original_url
+                    and "://web.archive.org/web/" in access_url
+                    and access_url.endswith(original_url)
+                ):
+                    # TODO: test/verify this
+                    timestamp = access_url.split("/")[4]
+                    # if not (len(timestamp) == 14 and timestamp.isdigit()):
+                    #    continue
+                    replay_url = (
+                        f"https://web.archive.org/web/{timestamp}id_/{original_url}"
+                    )
+                    return RedirectResponse(replay_url, status_code=302)
+                elif (
+                    archiveorg_path
+                    and "://archive.org/" in access_url
+                    and archiveorg_path in access_url
+                ):
+                    return RedirectResponse(access_url, status_code=302)
+
+    # give up and show an error page
+    lang = LangPrefix(request)
+    return i18n_templates[lang.code].TemplateResponse(
+        "access_404.html",
+        {
+            "request": request,
+            "locale": lang.code,
+            "lang_prefix": lang.prefix,
+            "work_ident": work_ident,
+            "original_url": original_url,
+            "archiveorg_path": archiveorg_path,
+        },
+        status_code=404,
+    )
+
+
+@web.get(
+    "/work/{work_ident}/access/wayback/{url:path}",
+    operation_id="access_redirect_wayback",
+    include_in_schema=False,
+)
+def access_redirect_wayback(
+    url: str,
+    request: Request,
+    work_ident: str = Query(..., min_length=20, max_length=20),
+) -> Any:
+    raw_original_url = "/".join(str(request.url).split("/")[7:])
+    # the quote() call is necessary because the URL is un-encoded in the path parameter
+    # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d
+    original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",)
+    doc_dict = get_es_scholar_doc(f"work_{work_ident}")
+    if not doc_dict:
+        return access_redirect_fallback(
+            request, work_ident=work_ident, original_url=original_url
+        )
+    doc: ScholarDoc = doc_dict["_obj"]
+    # combine fulltext with all access options
+    access: List[Any] = []
+    if doc.fulltext:
+        access.append(doc.fulltext)
+    access.extend(doc.access or [])
+    for opt in access:
+        if (
+            opt.access_type == "wayback"
+            and opt.access_url
+            and "://web.archive.org/web/" in opt.access_url
+            and opt.access_url.endswith(original_url)
+        ):
+            timestamp = opt.access_url.split("/")[4]
+            if not (len(timestamp) == 14 and timestamp.isdigit()):
+                continue
+            access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
+            return RedirectResponse(access_url, status_code=302)
+    return access_redirect_fallback(
+        request, work_ident=work_ident, original_url=original_url
+    )
+
+
+@web.get(
+    "/work/{work_ident}/access/ia_file/{item}/{file_path:path}",
+    operation_id="access_redirect_ia_file",
+    include_in_schema=False,
+)
+def access_redirect_ia_file(
+    item: str,
+    file_path: str,
+    request: Request,
+    work_ident: str = Query(..., min_length=20, max_length=20),
+) -> Any:
+    original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:]))
+    access_url = f"https://archive.org/download/{item}/{original_path}"
+    doc_dict = get_es_scholar_doc(f"work_{work_ident}")
+    if not doc_dict:
+        return access_redirect_fallback(
+            request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}"
+        )
+    doc: ScholarDoc = doc_dict["_obj"]
+    # combine fulltext with all access options
+    access: List[Any] = []
+    if doc.fulltext:
+        access.append(doc.fulltext)
+    access.extend(doc.access or [])
+    for opt in access:
+        if opt.access_type == "ia_file" and opt.access_url == access_url:
+            return RedirectResponse(access_url, status_code=302)
+    return access_redirect_fallback(
+        request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}"
+    )
+
+
 app = FastAPI(
     title="Fatcat Scholar",
     description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.",
diff --git a/tests/test_web.py b/tests/test_web.py
index 7f1f72a..d9cfab6 100644
--- a/tests/test_web.py
+++ b/tests/test_web.py
@@ -3,6 +3,7 @@ from typing import Any
 
 import pytest
 from fastapi.testclient import TestClient
+import fatcat_openapi_client
 
 from fatcat_scholar.web import app
 
@@ -148,7 +149,11 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None:
         == "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf"
     )
 
-    # check that URL is validated
+    # check that URL is validated (force fatcat API fallback to fail)
+    fatcat_api_raw = mocker.patch("fatcat_openapi_client.ApiClient.call_api")
+    fatcat_api_raw.side_effect = [
+        fatcat_openapi_client.ApiException(status=404, reason="dummy")
+    ]
     rv = client.get(
         "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY",
         allow_redirects=False,
@@ -156,6 +161,102 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None:
     assert rv.status_code == 404
 
 
+def test_access_redirect_fallback(client: Any, mocker: Any) -> None:
+
+    with open("tests/files/elastic_fulltext_get.json") as f:
+        elastic_resp = json.loads(f.read())
+
+    es_raw = mocker.patch(
+        "elasticsearch.connection.Urllib3HttpConnection.perform_request"
+    )
+    es_raw.side_effect = [
+        (200, {}, json.dumps(elastic_resp)),
+        (200, {}, json.dumps(elastic_resp)),
+        (200, {}, json.dumps(elastic_resp)),
+        (200, {}, json.dumps(elastic_resp)),
+    ]
+    fatcat_get_work_raw = mocker.patch("fatcat_openapi_client.DefaultApi.get_work")
+    fatcat_get_work_raw.side_effect = [
+        fatcat_openapi_client.WorkEntity(
+            state="active", ident="wwwwwwwwwwwwwwwwwwwwwwwwww",
+        )
+    ] * 4
+    fatcat_get_work_releases_raw = mocker.patch(
+        "fatcat_openapi_client.DefaultApi.get_work_releases"
+    )
+    fatcat_get_work_releases_raw.side_effect = [
+        [
+            fatcat_openapi_client.ReleaseEntity(
+                ident="rrrrrrrrrrrrrrrrrrrrrrrrrr",
+                ext_ids=fatcat_openapi_client.ReleaseExtIds(),
+            ),
+        ]
+    ] * 4
+    fatcat_get_release_raw = mocker.patch(
+        "fatcat_openapi_client.DefaultApi.get_release"
+    )
+    fatcat_get_release_raw.side_effect = [
+        fatcat_openapi_client.ReleaseEntity(
+            state="active",
+            ident="rrrrrrrrrrrrrrrrrrrrrrrrrr",
+            ext_ids=fatcat_openapi_client.ReleaseExtIds(),
+            files=[
+                fatcat_openapi_client.FileEntity(
+                    ident="ffffffffffffffffffffffffff",
+                    urls=[
+                        fatcat_openapi_client.FileUrl(
+                            rel="web", url="https://blarg.example.com",
+                        ),
+                        fatcat_openapi_client.FileUrl(
+                            rel="webarchive",
+                            url="https://web.archive.org/web/12345/https://example.com",
+                        ),
+                        fatcat_openapi_client.FileUrl(
+                            rel="archive",
+                            url="https://archive.org/download/some/thing.pdf",
+                        ),
+                    ],
+                ),
+            ],
+        )
+    ] * 4
+
+    # redirects should work after API lookup, for both wayback and archive.org
+    rv = client.get(
+        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://example.com",
+        allow_redirects=False,
+    )
+    assert rv.status_code == 302
+    assert (
+        rv.headers["Location"]
+        == "https://web.archive.org/web/12345id_/https://example.com"
+    )
+
+    rv = client.get(
+        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.pdf",
+        allow_redirects=False,
+    )
+    assert rv.status_code == 302
+    assert rv.headers["Location"] == "https://archive.org/download/some/thing.pdf"
+
+    # wrong URLs should still not work, but display a page with helpful links
+    rv = client.get(
+        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY",
+        allow_redirects=False,
+    )
+    assert rv.status_code == 404
+    assert b"Access Location Not Found" in rv.content
+    assert b"web.archive.org/web/*/https://www.federalreserve.gov" in rv.content
+
+    rv = client.get(
+        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.else.pdf",
+        allow_redirects=False,
+    )
+    assert rv.status_code == 404
+    assert b"Access Location Not Found" in rv.content
+    assert b"archive.org/download/some/thing.else.pdf" in rv.content
+
+
 def test_access_redirect_encoding(client: Any, mocker: Any) -> None:
 
     with open("tests/files/elastic_get_work_a6gvpil4brdgzhqyaog3ftngqe.json") as f:
author	Bryan Newbold <bnewbold@archive.org>	2021-07-26 20:52:56 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-07-26 20:52:58 -0700
commit	7ebcdfebdf4e1c8b69026a24cafe500c67dbc384 (patch)
tree	ab580d4307acc5d10b5a8669d28ac6a6ec6e81e5
parent	eeb456c16d016d8523023f787597efae7a6317b9 (diff)
download	fatcat-scholar-7ebcdfebdf4e1c8b69026a24cafe500c67dbc384.tar.gz fatcat-scholar-7ebcdfebdf4e1c8b69026a24cafe500c67dbc384.zip