Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- | fatcat_scholar/templates/access_404.html | 35
-rw-r--r-- | fatcat_scholar/web.py | 226
2 files changed, 195 insertions, 66 deletions
diff --git a/fatcat_scholar/templates/access_404.html b/fatcat_scholar/templates/access_404.html
new file mode 100644
index 0000000..d058186
--- /dev/null
+++ b/fatcat_scholar/templates/access_404.html
@@ -0,0 +1,35 @@
+{% extends "base.html" %}
+
+{% block title %}
+404 - {{ super() }}
+{% endblock %}
+
+{% block main %}
+<div class="ui icon error message">
+  <div class="content">
+    <div class="header">{% trans %}404: Access Location Not Found{% endtrans %}</div>
+    <p>{% trans %}We could not find a valid redirect for the URL you tried. Sorry about that!{% endtrans %}
+    <p>{% trans %}There may be a typo, truncation, or encoding error. Or, the resource may have been removed from our catalog.{% endtrans %}
+    <p>{% trans %}Some places you can visit try to hunt down this resource (or a replacement) include:{% endtrans %}
+    <ul>
+      {% if original_url %}
+      <li>{% trans %}Original web url:{% endtrans %}
+        <br>
+        <code style="word-break: break-all;"><a href="{{ original_url }}">{{ original_url }}</a></code>
+      </li>
+      <li><a href="https://web.archive.org/web/*/{{ original_url }}">{% trans %}Wayback Machine calendar page (all captures){% endtrans %}</a>
+      {% endif %}
+      {% if archiveorg_path %}
+      <li>{% trans %}archive.org download link for the item:{% endtrans %}
+        {% set archiveorg_url="https://archive.org/download" + archiveorg_path %}
+        <br>
+        <code style="word-break: break-all;"><a href="{{ archiveorg_url }}">{{ archiveorg_url }}</a></code>
+      {% endif %}
+      {% if work_ident %}
+      <li><a href="/work/{{ work_ident }}">{% trans %}Scholar landing page{% endtrans %}</a>
+      <li><a href="https://fatcat.wiki/work/{{ work_ident }}">{% trans %}Fatcat catalog page{% endtrans %}</a>
+      {% endif %}
+    </ul>
+  </div>
+</div>
+{% endblock %}
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index b5af18e..a705e20 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -20,6 +20,7 @@ from fastapi.responses import (
     RedirectResponse,
 )
 from fastapi.middleware.cors import CORSMiddleware
+import fatcat_openapi_client
 import sentry_sdk
 from sentry_sdk.integrations.asgi import SentryAsgiMiddleware
 from starlette_prometheus import metrics, PrometheusMiddleware
@@ -182,72 +183,6 @@ def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict
     return doc
 
 
-@api.get(
-    "/work/{work_ident}/access/wayback/{url:path}",
-    operation_id="access_redirect_wayback",
-    include_in_schema=False,
-)
-def access_redirect_wayback(
-    url: str,
-    request: Request,
-    work_ident: str = Query(..., min_length=20, max_length=20),
-) -> Any:
-    raw_original_url = "/".join(str(request.url).split("/")[7:])
-    # the quote() call is necessary because the URL is un-encoded in the path parameter
-    # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d
-    original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",)
-    doc_dict = get_es_scholar_doc(f"work_{work_ident}")
-    if not doc_dict:
-        raise HTTPException(status_code=404, detail="work not found")
-    doc: ScholarDoc = doc_dict["_obj"]
-    # combine fulltext with all access options
-    access: List[Any] = []
-    if doc.fulltext:
-        access.append(doc.fulltext)
-    access.extend(doc.access or [])
-    for opt in access:
-        if (
-            opt.access_type == "wayback"
-            and opt.access_url
-            and "://web.archive.org/web/" in opt.access_url
-            and opt.access_url.endswith(original_url)
-        ):
-            timestamp = opt.access_url.split("/")[4]
-            if not (len(timestamp) == 14 and timestamp.isdigit()):
-                continue
-            access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
-            return RedirectResponse(access_url, status_code=302)
-    raise HTTPException(status_code=404, detail="access URL not found")
-
-
-@api.get(
-    "/work/{work_ident}/access/ia_file/{item}/{file_path:path}",
-    operation_id="access_redirect_ia_file",
-    include_in_schema=False,
-)
-def access_redirect_ia_file(
-    item: str,
-    file_path: str,
-    request: Request,
-    work_ident: str = Query(..., min_length=20, max_length=20),
-) -> Any:
-    original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:]))
-    access_url = f"https://archive.org/download/{item}/{original_path}"
-    doc_dict = get_es_scholar_doc(f"work_{work_ident}")
-    if not doc_dict:
-        raise HTTPException(status_code=404, detail="work not found")
-    doc: ScholarDoc = doc_dict["_obj"]
-    # combine fulltext with all access options
-    access: List[Any] = []
-    if doc.fulltext:
-        access.append(doc.fulltext)
-    access.extend(doc.access or [])
-    for opt in access:
-        if opt.access_type == "ia_file" and opt.access_url == access_url:
-            return RedirectResponse(access_url, status_code=302)
-    raise HTTPException(status_code=404, detail="access URL not found")
-
-
 web = APIRouter()
 
 
@@ -413,6 +348,165 @@ def web_work(
     )
 
 
+def access_redirect_fallback(
+    request: Request,
+    work_ident: str,
+    original_url: Optional[str] = None,
+    archiveorg_path: Optional[str] = None,
+) -> Any:
+    """
+    The purpose of this helper is to catch access redirects which would
+    otherwise return a 404, and "try harder" to find a redirect.
+    """
+    # lookup against the live fatcat API, instead of scholar ES index
+    api_conf = fatcat_openapi_client.Configuration()
+    api_conf.host = settings.FATCAT_API_HOST
+    api_client = fatcat_openapi_client.DefaultApi(
+        fatcat_openapi_client.ApiClient(api_conf)
+    )
+
+    # fetch list of releases for this work from current fatcat catalog. note
+    # that these releases are not expanded (don't include file entities)
+    try:
+        # fetch work entity itself to fail fast (true 404) and handle redirects
+        work_entity = api_client.get_work(work_ident)
+        logger.warning(
+            f"access_redirect_fallback: work_{work_ident} state={work_entity.state} redirect={work_entity.redirect}"
+        )
+        if work_entity.redirect:
+            work_ident = work_entity.redirect
+        partial_releases = api_client.get_work_releases(
+            ident=work_ident, hide="abstracts,references",
+        )
+    except fatcat_openapi_client.ApiException as ae:
+        raise HTTPException(
+            status_code=ae.status,
+            detail=f"Fatcat API call failed for work_{work_ident}",
+        )
+
+    # for each release, check for any archive.org access option with the given context
+    for partial in partial_releases:
+        release = api_client.get_release(
+            partial.ident,
+            expand="files",
+            # TODO: expand="files,filesets,webcaptures",
+            hide="abstracts,references",
+        )
+        if not release.files:
+            continue
+        for fe in release.files:
+            for url_pair in fe.urls:
+                access_url = url_pair.url
+                if (
+                    original_url
+                    and "://web.archive.org/web/" in access_url
+                    and access_url.endswith(original_url)
+                ):
+                    # TODO: test/verify this
+                    timestamp = access_url.split("/")[4]
+                    # if not (len(timestamp) == 14 and timestamp.isdigit()):
+                    #    continue
+                    replay_url = (
+                        f"https://web.archive.org/web/{timestamp}id_/{original_url}"
+                    )
+                    return RedirectResponse(replay_url, status_code=302)
+                elif (
+                    archiveorg_path
+                    and "://archive.org/" in access_url
+                    and archiveorg_path in access_url
+                ):
+                    return RedirectResponse(access_url, status_code=302)
+
+    # give up and show an error page
+    lang = LangPrefix(request)
+    return i18n_templates[lang.code].TemplateResponse(
+        "access_404.html",
+        {
+            "request": request,
+            "locale": lang.code,
+            "lang_prefix": lang.prefix,
+            "work_ident": work_ident,
+            "original_url": original_url,
+            "archiveorg_path": archiveorg_path,
+        },
+        status_code=404,
+    )
+
+
+@web.get(
+    "/work/{work_ident}/access/wayback/{url:path}",
+    operation_id="access_redirect_wayback",
+    include_in_schema=False,
+)
+def access_redirect_wayback(
+    url: str,
+    request: Request,
+    work_ident: str = Query(..., min_length=20, max_length=20),
+) -> Any:
+    raw_original_url = "/".join(str(request.url).split("/")[7:])
+    # the quote() call is necessary because the URL is un-encoded in the path parameter
+    # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d
+    original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",)
+    doc_dict = get_es_scholar_doc(f"work_{work_ident}")
+    if not doc_dict:
+        return access_redirect_fallback(
+            request, work_ident=work_ident, original_url=original_url
+        )
+    doc: ScholarDoc = doc_dict["_obj"]
+    # combine fulltext with all access options
+    access: List[Any] = []
+    if doc.fulltext:
+        access.append(doc.fulltext)
+    access.extend(doc.access or [])
+    for opt in access:
+        if (
+            opt.access_type == "wayback"
+            and opt.access_url
+            and "://web.archive.org/web/" in opt.access_url
+            and opt.access_url.endswith(original_url)
+        ):
+            timestamp = opt.access_url.split("/")[4]
+            if not (len(timestamp) == 14 and timestamp.isdigit()):
+                continue
+            access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
+            return RedirectResponse(access_url, status_code=302)
+    return access_redirect_fallback(
+        request, work_ident=work_ident, original_url=original_url
+    )
+
+
+@web.get(
+    "/work/{work_ident}/access/ia_file/{item}/{file_path:path}",
+    operation_id="access_redirect_ia_file",
+    include_in_schema=False,
+)
+def access_redirect_ia_file(
+    item: str,
+    file_path: str,
+    request: Request,
+    work_ident: str = Query(..., min_length=20, max_length=20),
+) -> Any:
+    original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:]))
+    access_url = f"https://archive.org/download/{item}/{original_path}"
+    doc_dict = get_es_scholar_doc(f"work_{work_ident}")
+    if not doc_dict:
+        return access_redirect_fallback(
+            request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}"
+        )
+    doc: ScholarDoc = doc_dict["_obj"]
+    # combine fulltext with all access options
+    access: List[Any] = []
+    if doc.fulltext:
+        access.append(doc.fulltext)
+    access.extend(doc.access or [])
+    for opt in access:
+        if opt.access_type == "ia_file" and opt.access_url == access_url:
+            return RedirectResponse(access_url, status_code=302)
+    return access_redirect_fallback(
+        request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}"
+    )
+
+
 app = FastAPI(
     title="Fatcat Scholar",
     description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.",
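
Note on the wayback redirect logic above: access_redirect_wayback() rebuilds the requested original URL from the path segments after ".../access/wayback/", looks for a catalog access option whose capture URL ends with that original URL and carries a 14-digit timestamp, and redirects to the raw "id_" Wayback replay. A minimal standalone sketch of that matching follows; the helper name and the example capture/paper URLs are invented for illustration and are not part of the patch.

    # Illustrative sketch only (not from the patch); mirrors the timestamp check
    # and "id_" replay URL construction in access_redirect_wayback() above.
    from typing import Optional

    def wayback_replay_url(access_url: str, original_url: str) -> Optional[str]:
        # expected layout: https://web.archive.org/web/<14-digit-timestamp>/<original URL>
        if "://web.archive.org/web/" not in access_url:
            return None
        if not access_url.endswith(original_url):
            return None
        timestamp = access_url.split("/")[4]
        if not (len(timestamp) == 14 and timestamp.isdigit()):
            return None
        # "id_" asks Wayback for the unmodified capture (no banner or link rewriting)
        return f"https://web.archive.org/web/{timestamp}id_/{original_url}"

    assert wayback_replay_url(
        "https://web.archive.org/web/20200101123456/https://example.com/paper.pdf",
        "https://example.com/paper.pdf",
    ) == "https://web.archive.org/web/20200101123456id_/https://example.com/paper.pdf"

The "id_" modifier is what makes the redirect useful for direct file access: Wayback serves the capture as originally archived rather than wrapped in the replay UI.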
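
Similarly, access_redirect_ia_file() reconstructs the exact archive.org download URL from the request path (everything after ".../access/ia_file/<item>/") and only redirects when that string matches a stored access option byte-for-byte, which is why the rebuilt path is percent-encoded with urllib.parse.quote(). A rough sketch of that reconstruction, with an invented request URL, item name, and helper name (illustrative only, not from the patch):

    # Illustrative sketch only; mirrors the path splitting and quoting in
    # access_redirect_ia_file() above.
    import urllib.parse

    def ia_download_url(request_url: str, item: str) -> str:
        # segments [8:] of the request URL are the file path inside the item:
        # ["https:", "", host, "work", ident, "access", "ia_file", item, *file_path]
        original_path = urllib.parse.quote("/".join(request_url.split("/")[8:]))
        return f"https://archive.org/download/{item}/{original_path}"

    print(ia_download_url(
        "https://scholar.archive.org/work/exampleident/access/ia_file/example-item/paper copy.pdf",
        "example-item",
    ))
    # https://archive.org/download/example-item/paper%20copy.pdf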