From 7ebcdfebdf4e1c8b69026a24cafe500c67dbc384 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 26 Jul 2021 20:52:56 -0700 Subject: web: access_redirect_fallback mechanism This adds a helper code path that "tries harder" to find an access link, by querying the fatcat API directly to look for any file from any release associated with the work. If it finds a match, it does the redirect as usual (but does log the incident). If no match can be found, there is now a more helpful access-specific 404 error page. If the *work* is a 404, the generic error page is shown. --- fatcat_scholar/templates/access_404.html | 35 +++++ fatcat_scholar/web.py | 226 ++++++++++++++++++++++--------- tests/test_web.py | 103 +++++++++++++- 3 files changed, 297 insertions(+), 67 deletions(-) create mode 100644 fatcat_scholar/templates/access_404.html diff --git a/fatcat_scholar/templates/access_404.html b/fatcat_scholar/templates/access_404.html new file mode 100644 index 0000000..d058186 --- /dev/null +++ b/fatcat_scholar/templates/access_404.html @@ -0,0 +1,35 @@ +{% extends "base.html" %} + +{% block title %} +404 - {{ super() }} +{% endblock %} + +{% block main %} +
+
+
{% trans %}404: Access Location Not Found{% endtrans %}
+

{% trans %}We could not find a valid redirect for the URL you tried. Sorry about that!{% endtrans %} +

{% trans %}There may be a typo, truncation, or encoding error. Or, the resource may have been removed from our catalog.{% endtrans %} +

{% trans %}Some places you can visit to try to hunt down this resource (or a replacement) include:{% endtrans %} +

+
+
+{% endblock %} diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py index b5af18e..a705e20 100644 --- a/fatcat_scholar/web.py +++ b/fatcat_scholar/web.py @@ -20,6 +20,7 @@ from fastapi.responses import ( RedirectResponse, ) from fastapi.middleware.cors import CORSMiddleware +import fatcat_openapi_client import sentry_sdk from sentry_sdk.integrations.asgi import SentryAsgiMiddleware from starlette_prometheus import metrics, PrometheusMiddleware @@ -182,72 +183,6 @@ def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict return doc -@api.get( - "/work/{work_ident}/access/wayback/{url:path}", - operation_id="access_redirect_wayback", - include_in_schema=False, -) -def access_redirect_wayback( - url: str, - request: Request, - work_ident: str = Query(..., min_length=20, max_length=20), -) -> Any: - raw_original_url = "/".join(str(request.url).split("/")[7:]) - # the quote() call is necessary because the URL is un-encoded in the path parameter - # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d - original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",) - doc_dict = get_es_scholar_doc(f"work_{work_ident}") - if not doc_dict: - raise HTTPException(status_code=404, detail="work not found") - doc: ScholarDoc = doc_dict["_obj"] - # combine fulltext with all access options - access: List[Any] = [] - if doc.fulltext: - access.append(doc.fulltext) - access.extend(doc.access or []) - for opt in access: - if ( - opt.access_type == "wayback" - and opt.access_url - and "://web.archive.org/web/" in opt.access_url - and opt.access_url.endswith(original_url) - ): - timestamp = opt.access_url.split("/")[4] - if not (len(timestamp) == 14 and timestamp.isdigit()): - continue - access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}" - return RedirectResponse(access_url, status_code=302) - raise HTTPException(status_code=404, detail="access URL not found") - - -@api.get( 
- "/work/{work_ident}/access/ia_file/{item}/{file_path:path}", - operation_id="access_redirect_ia_file", - include_in_schema=False, -) -def access_redirect_ia_file( - item: str, - file_path: str, - request: Request, - work_ident: str = Query(..., min_length=20, max_length=20), -) -> Any: - original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:])) - access_url = f"https://archive.org/download/{item}/{original_path}" - doc_dict = get_es_scholar_doc(f"work_{work_ident}") - if not doc_dict: - raise HTTPException(status_code=404, detail="work not found") - doc: ScholarDoc = doc_dict["_obj"] - # combine fulltext with all access options - access: List[Any] = [] - if doc.fulltext: - access.append(doc.fulltext) - access.extend(doc.access or []) - for opt in access: - if opt.access_type == "ia_file" and opt.access_url == access_url: - return RedirectResponse(access_url, status_code=302) - raise HTTPException(status_code=404, detail="access URL not found") - - web = APIRouter() @@ -413,6 +348,165 @@ def web_work( ) +def access_redirect_fallback( + request: Request, + work_ident: str, + original_url: Optional[str] = None, + archiveorg_path: Optional[str] = None, +) -> Any: + """ + The purpose of this helper is to catch access redirects which would + otherwise return a 404, and "try harder" to find a redirect. + """ + # lookup against the live fatcat API, instead of scholar ES index + api_conf = fatcat_openapi_client.Configuration() + api_conf.host = settings.FATCAT_API_HOST + api_client = fatcat_openapi_client.DefaultApi( + fatcat_openapi_client.ApiClient(api_conf) + ) + + # fetch list of releases for this work from current fatcat catalog. 
note + # that these releases are not expanded (don't include file entities) + try: + # fetch work entity itself to fail fast (true 404) and handle redirects + work_entity = api_client.get_work(work_ident) + logger.warning( + f"access_redirect_fallback: work_{work_ident} state={work_entity.state} redirect={work_entity.redirect}" + ) + if work_entity.redirect: + work_ident = work_entity.redirect + partial_releases = api_client.get_work_releases( + ident=work_ident, hide="abstracts,references", + ) + except fatcat_openapi_client.ApiException as ae: + raise HTTPException( + status_code=ae.status, + detail=f"Fatcat API call failed for work_{work_ident}", + ) + + # for each release, check for any archive.org access option with the given context + for partial in partial_releases: + release = api_client.get_release( + partial.ident, + expand="files", + # TODO: expand="files,filesets,webcaptures", + hide="abstracts,references", + ) + if not release.files: + continue + for fe in release.files: + for url_pair in fe.urls: + access_url = url_pair.url + if ( + original_url + and "://web.archive.org/web/" in access_url + and access_url.endswith(original_url) + ): + # TODO: test/verify this + timestamp = access_url.split("/")[4] + # if not (len(timestamp) == 14 and timestamp.isdigit()): + # continue + replay_url = ( + f"https://web.archive.org/web/{timestamp}id_/{original_url}" + ) + return RedirectResponse(replay_url, status_code=302) + elif ( + archiveorg_path + and "://archive.org/" in access_url + and archiveorg_path in access_url + ): + return RedirectResponse(access_url, status_code=302) + + # give up and show an error page + lang = LangPrefix(request) + return i18n_templates[lang.code].TemplateResponse( + "access_404.html", + { + "request": request, + "locale": lang.code, + "lang_prefix": lang.prefix, + "work_ident": work_ident, + "original_url": original_url, + "archiveorg_path": archiveorg_path, + }, + status_code=404, + ) + + +@web.get( + 
"/work/{work_ident}/access/wayback/{url:path}", + operation_id="access_redirect_wayback", + include_in_schema=False, +) +def access_redirect_wayback( + url: str, + request: Request, + work_ident: str = Query(..., min_length=20, max_length=20), +) -> Any: + raw_original_url = "/".join(str(request.url).split("/")[7:]) + # the quote() call is necessary because the URL is un-encoded in the path parameter + # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d + original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",) + doc_dict = get_es_scholar_doc(f"work_{work_ident}") + if not doc_dict: + return access_redirect_fallback( + request, work_ident=work_ident, original_url=original_url + ) + doc: ScholarDoc = doc_dict["_obj"] + # combine fulltext with all access options + access: List[Any] = [] + if doc.fulltext: + access.append(doc.fulltext) + access.extend(doc.access or []) + for opt in access: + if ( + opt.access_type == "wayback" + and opt.access_url + and "://web.archive.org/web/" in opt.access_url + and opt.access_url.endswith(original_url) + ): + timestamp = opt.access_url.split("/")[4] + if not (len(timestamp) == 14 and timestamp.isdigit()): + continue + access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}" + return RedirectResponse(access_url, status_code=302) + return access_redirect_fallback( + request, work_ident=work_ident, original_url=original_url + ) + + +@web.get( + "/work/{work_ident}/access/ia_file/{item}/{file_path:path}", + operation_id="access_redirect_ia_file", + include_in_schema=False, +) +def access_redirect_ia_file( + item: str, + file_path: str, + request: Request, + work_ident: str = Query(..., min_length=20, max_length=20), +) -> Any: + original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:])) + access_url = f"https://archive.org/download/{item}/{original_path}" + doc_dict = get_es_scholar_doc(f"work_{work_ident}") + if not doc_dict: + 
return access_redirect_fallback( + request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}" + ) + doc: ScholarDoc = doc_dict["_obj"] + # combine fulltext with all access options + access: List[Any] = [] + if doc.fulltext: + access.append(doc.fulltext) + access.extend(doc.access or []) + for opt in access: + if opt.access_type == "ia_file" and opt.access_url == access_url: + return RedirectResponse(access_url, status_code=302) + return access_redirect_fallback( + request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}" + ) + + app = FastAPI( title="Fatcat Scholar", description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.", diff --git a/tests/test_web.py b/tests/test_web.py index 7f1f72a..d9cfab6 100644 --- a/tests/test_web.py +++ b/tests/test_web.py @@ -3,6 +3,7 @@ from typing import Any import pytest from fastapi.testclient import TestClient +import fatcat_openapi_client from fatcat_scholar.web import app @@ -148,7 +149,11 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None: == "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf" ) - # check that URL is validated + # check that URL is validated (force fatcat API fallback to fail) + fatcat_api_raw = mocker.patch("fatcat_openapi_client.ApiClient.call_api") + fatcat_api_raw.side_effect = [ + fatcat_openapi_client.ApiException(status=404, reason="dummy") + ] rv = client.get( "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY", allow_redirects=False, @@ -156,6 +161,102 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None: assert rv.status_code == 404 +def test_access_redirect_fallback(client: Any, mocker: Any) -> None: + + with open("tests/files/elastic_fulltext_get.json") as f: + elastic_resp = json.loads(f.read()) + + es_raw = mocker.patch( + 
"elasticsearch.connection.Urllib3HttpConnection.perform_request" + ) + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + ] + fatcat_get_work_raw = mocker.patch("fatcat_openapi_client.DefaultApi.get_work") + fatcat_get_work_raw.side_effect = [ + fatcat_openapi_client.WorkEntity( + state="active", ident="wwwwwwwwwwwwwwwwwwwwwwwwww", + ) + ] * 4 + fatcat_get_work_releases_raw = mocker.patch( + "fatcat_openapi_client.DefaultApi.get_work_releases" + ) + fatcat_get_work_releases_raw.side_effect = [ + [ + fatcat_openapi_client.ReleaseEntity( + ident="rrrrrrrrrrrrrrrrrrrrrrrrrr", + ext_ids=fatcat_openapi_client.ReleaseExtIds(), + ), + ] + ] * 4 + fatcat_get_release_raw = mocker.patch( + "fatcat_openapi_client.DefaultApi.get_release" + ) + fatcat_get_release_raw.side_effect = [ + fatcat_openapi_client.ReleaseEntity( + state="active", + ident="rrrrrrrrrrrrrrrrrrrrrrrrrr", + ext_ids=fatcat_openapi_client.ReleaseExtIds(), + files=[ + fatcat_openapi_client.FileEntity( + ident="ffffffffffffffffffffffffff", + urls=[ + fatcat_openapi_client.FileUrl( + rel="web", url="https://blarg.example.com", + ), + fatcat_openapi_client.FileUrl( + rel="webarchive", + url="https://web.archive.org/web/12345/https://example.com", + ), + fatcat_openapi_client.FileUrl( + rel="archive", + url="https://archive.org/download/some/thing.pdf", + ), + ], + ), + ], + ) + ] * 4 + + # redirects should work after API lookup, for both wayback and archive.org + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://example.com", + allow_redirects=False, + ) + assert rv.status_code == 302 + assert ( + rv.headers["Location"] + == "https://web.archive.org/web/12345id_/https://example.com" + ) + + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.pdf", + allow_redirects=False, + ) + assert rv.status_code == 302 + assert 
rv.headers["Location"] == "https://archive.org/download/some/thing.pdf" + + # wrong URLs should still not work, but display a page with helpful links + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY", + allow_redirects=False, + ) + assert rv.status_code == 404 + assert b"Access Location Not Found" in rv.content + assert b"web.archive.org/web/*/https://www.federalreserve.gov" in rv.content + + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.else.pdf", + allow_redirects=False, + ) + assert rv.status_code == 404 + assert b"Access Location Not Found" in rv.content + assert b"archive.org/download/some/thing.else.pdf" in rv.content + + def test_access_redirect_encoding(client: Any, mocker: Any) -> None: with open("tests/files/elastic_get_work_a6gvpil4brdgzhqyaog3ftngqe.json") as f: -- cgit v1.2.3