diff options
-rw-r--r-- | fatcat_scholar/templates/access_404.html | 35 | ||||
-rw-r--r-- | fatcat_scholar/web.py | 226 | ||||
-rw-r--r-- | tests/test_web.py | 103 |
3 files changed, 297 insertions, 67 deletions
diff --git a/fatcat_scholar/templates/access_404.html b/fatcat_scholar/templates/access_404.html new file mode 100644 index 0000000..d058186 --- /dev/null +++ b/fatcat_scholar/templates/access_404.html @@ -0,0 +1,35 @@ +{% extends "base.html" %} + +{% block title %} +404 - {{ super() }} +{% endblock %} + +{% block main %} +<div class="ui icon error message"> + <div class="content"> + <div class="header">{% trans %}404: Access Location Not Found{% endtrans %}</div> + <p>{% trans %}We could not find a valid redirect for the URL you tried. Sorry about that!{% endtrans %} + <p>{% trans %}There may be a typo, truncation, or encoding error. Or, the resource may have been removed from our catalog.{% endtrans %} + <p>{% trans %}Some places you can visit try to hunt down this resource (or a replacement) include:{% endtrans %} + <ul> + {% if original_url %} + <li>{% trans %}Original web url:{% endtrans %} + <br> + <code style="word-break: break-all;"><a href="{{ original_url }}">{{ original_url }}</a></code> + </li> + <li><a href="https://web.archive.org/web/*/{{ original_url }}">{% trans %}Wayback Machine calendar page (all captures){% endtrans %}</a> + {% endif %} + {% if archiveorg_path %} + <li>{% trans %}archive.org download link for the item:{% endtrans %} + {% set archiveorg_url="https://archive.org/download" + archiveorg_path %} + <br> + <code style="word-break: break-all;"><a href="{{ archiveorg_url }}">{{ archiveorg_url }}</a></code> + {% endif %} + {% if work_ident %} + <li><a href="/work/{{ work_ident }}">{% trans %}Scholar landing page{% endtrans %}</a> + <li><a href="https://fatcat.wiki/work/{{ work_ident }}">{% trans %}Fatcat catalog page{% endtrans %}</a> + {% endif %} + </ul> + </div> +</div> +{% endblock %} diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py index b5af18e..a705e20 100644 --- a/fatcat_scholar/web.py +++ b/fatcat_scholar/web.py @@ -20,6 +20,7 @@ from fastapi.responses import ( RedirectResponse, ) from fastapi.middleware.cors import CORSMiddleware +import fatcat_openapi_client import sentry_sdk from sentry_sdk.integrations.asgi import SentryAsgiMiddleware from starlette_prometheus import metrics, PrometheusMiddleware @@ -182,72 +183,6 @@ def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict return doc -@api.get( - "/work/{work_ident}/access/wayback/{url:path}", - operation_id="access_redirect_wayback", - include_in_schema=False, -) -def access_redirect_wayback( - url: str, - request: Request, - work_ident: str = Query(..., min_length=20, max_length=20), -) -> Any: - raw_original_url = "/".join(str(request.url).split("/")[7:]) - # the quote() call is necessary because the URL is un-encoded in the path parameter - # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d - original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",) - doc_dict = get_es_scholar_doc(f"work_{work_ident}") - if not doc_dict: - raise HTTPException(status_code=404, detail="work not found") - doc: ScholarDoc = doc_dict["_obj"] - # combine fulltext with all access options - access: List[Any] = [] - if doc.fulltext: - access.append(doc.fulltext) - access.extend(doc.access or []) - for opt in access: - if ( - opt.access_type == "wayback" - and opt.access_url - and "://web.archive.org/web/" in opt.access_url - and opt.access_url.endswith(original_url) - ): - timestamp = opt.access_url.split("/")[4] - if not (len(timestamp) == 14 and timestamp.isdigit()): - continue - access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}" - return RedirectResponse(access_url, status_code=302) - raise HTTPException(status_code=404, detail="access URL not found") - - -@api.get( - "/work/{work_ident}/access/ia_file/{item}/{file_path:path}", - operation_id="access_redirect_ia_file", - include_in_schema=False, -) -def access_redirect_ia_file( - item: str, - file_path: str, - request: Request, - work_ident: str = Query(..., min_length=20, max_length=20), -) -> Any: - original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:])) - access_url = f"https://archive.org/download/{item}/{original_path}" - doc_dict = get_es_scholar_doc(f"work_{work_ident}") - if not doc_dict: - raise HTTPException(status_code=404, detail="work not found") - doc: ScholarDoc = doc_dict["_obj"] - # combine fulltext with all access options - access: List[Any] = [] - if doc.fulltext: - access.append(doc.fulltext) - access.extend(doc.access or []) - for opt in access: - if opt.access_type == "ia_file" and opt.access_url == access_url: - return RedirectResponse(access_url, status_code=302) - raise HTTPException(status_code=404, detail="access URL not found") - - web = APIRouter() @@ -413,6 +348,165 @@ def web_work( ) +def access_redirect_fallback( + request: Request, + work_ident: str, + original_url: Optional[str] = None, + archiveorg_path: Optional[str] = None, +) -> Any: + """ + The purpose of this helper is to catch access redirects which would + otherwise return a 404, and "try harder" to find a redirect. + """ + # lookup against the live fatcat API, instead of scholar ES index + api_conf = fatcat_openapi_client.Configuration() + api_conf.host = settings.FATCAT_API_HOST + api_client = fatcat_openapi_client.DefaultApi( + fatcat_openapi_client.ApiClient(api_conf) + ) + + # fetch list of releases for this work from current fatcat catalog. note + # that these releases are not expanded (don't include file entities) + try: + # fetch work entity itself to fail fast (true 404) and handle redirects + work_entity = api_client.get_work(work_ident) + logger.warning( + f"access_redirect_fallback: work_{work_ident} state={work_entity.state} redirect={work_entity.redirect}" + ) + if work_entity.redirect: + work_ident = work_entity.redirect + partial_releases = api_client.get_work_releases( + ident=work_ident, hide="abstracts,references", + ) + except fatcat_openapi_client.ApiException as ae: + raise HTTPException( + status_code=ae.status, + detail=f"Fatcat API call failed for work_{work_ident}", + ) + + # for each release, check for any archive.org access option with the given context + for partial in partial_releases: + release = api_client.get_release( + partial.ident, + expand="files", + # TODO: expand="files,filesets,webcaptures", + hide="abstracts,references", + ) + if not release.files: + continue + for fe in release.files: + for url_pair in fe.urls: + access_url = url_pair.url + if ( + original_url + and "://web.archive.org/web/" in access_url + and access_url.endswith(original_url) + ): + # TODO: test/verify this + timestamp = access_url.split("/")[4] + # if not (len(timestamp) == 14 and timestamp.isdigit()): + # continue + replay_url = ( + f"https://web.archive.org/web/{timestamp}id_/{original_url}" + ) + return RedirectResponse(replay_url, status_code=302) + elif ( + archiveorg_path + and "://archive.org/" in access_url + and archiveorg_path in access_url + ): + return RedirectResponse(access_url, status_code=302) + + # give up and show an error page + lang = LangPrefix(request) + return i18n_templates[lang.code].TemplateResponse( + "access_404.html", + { + "request": request, + "locale": lang.code, + "lang_prefix": lang.prefix, + "work_ident": work_ident, + "original_url": original_url, + "archiveorg_path": archiveorg_path, + }, + status_code=404, + ) + + +@web.get( + "/work/{work_ident}/access/wayback/{url:path}", + operation_id="access_redirect_wayback", + include_in_schema=False, +) +def access_redirect_wayback( + url: str, + request: Request, + work_ident: str = Query(..., min_length=20, max_length=20), +) -> Any: + raw_original_url = "/".join(str(request.url).split("/")[7:]) + # the quote() call is necessary because the URL is un-encoded in the path parameter + # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d + original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",) + doc_dict = get_es_scholar_doc(f"work_{work_ident}") + if not doc_dict: + return access_redirect_fallback( + request, work_ident=work_ident, original_url=original_url + ) + doc: ScholarDoc = doc_dict["_obj"] + # combine fulltext with all access options + access: List[Any] = [] + if doc.fulltext: + access.append(doc.fulltext) + access.extend(doc.access or []) + for opt in access: + if ( + opt.access_type == "wayback" + and opt.access_url + and "://web.archive.org/web/" in opt.access_url + and opt.access_url.endswith(original_url) + ): + timestamp = opt.access_url.split("/")[4] + if not (len(timestamp) == 14 and timestamp.isdigit()): + continue + access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}" + return RedirectResponse(access_url, status_code=302) + return access_redirect_fallback( + request, work_ident=work_ident, original_url=original_url + ) + + +@web.get( + "/work/{work_ident}/access/ia_file/{item}/{file_path:path}", + operation_id="access_redirect_ia_file", + include_in_schema=False, +) +def access_redirect_ia_file( + item: str, + file_path: str, + request: Request, + work_ident: str = Query(..., min_length=20, max_length=20), +) -> Any: + original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:])) + access_url = f"https://archive.org/download/{item}/{original_path}" + doc_dict = get_es_scholar_doc(f"work_{work_ident}") + if not doc_dict: + return access_redirect_fallback( + request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}" + ) + doc: ScholarDoc = doc_dict["_obj"] + # combine fulltext with all access options + access: List[Any] = [] + if doc.fulltext: + access.append(doc.fulltext) + access.extend(doc.access or []) + for opt in access: + if opt.access_type == "ia_file" and opt.access_url == access_url: + return RedirectResponse(access_url, status_code=302) + return access_redirect_fallback( + request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}" + ) + + app = FastAPI( title="Fatcat Scholar", description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.", diff --git a/tests/test_web.py b/tests/test_web.py index 7f1f72a..d9cfab6 100644 --- a/tests/test_web.py +++ b/tests/test_web.py @@ -3,6 +3,7 @@ from typing import Any import pytest from fastapi.testclient import TestClient +import fatcat_openapi_client from fatcat_scholar.web import app @@ -148,7 +149,11 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None: == "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf" ) - # check that URL is validated + # check that URL is validated (force fatcat API fallback to fail) + fatcat_api_raw = mocker.patch("fatcat_openapi_client.ApiClient.call_api") + fatcat_api_raw.side_effect = [ + fatcat_openapi_client.ApiException(status=404, reason="dummy") + ] rv = client.get( "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY", allow_redirects=False, @@ -156,6 +161,102 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None: assert rv.status_code == 404 +def test_access_redirect_fallback(client: Any, mocker: Any) -> None: + + with open("tests/files/elastic_fulltext_get.json") as f: + elastic_resp = json.loads(f.read()) + + es_raw = mocker.patch( + "elasticsearch.connection.Urllib3HttpConnection.perform_request" + ) + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + ] + fatcat_get_work_raw = mocker.patch("fatcat_openapi_client.DefaultApi.get_work") + fatcat_get_work_raw.side_effect = [ + fatcat_openapi_client.WorkEntity( + state="active", ident="wwwwwwwwwwwwwwwwwwwwwwwwww", + ) + ] * 4 + fatcat_get_work_releases_raw = mocker.patch( + "fatcat_openapi_client.DefaultApi.get_work_releases" + ) + fatcat_get_work_releases_raw.side_effect = [ + [ + fatcat_openapi_client.ReleaseEntity( + ident="rrrrrrrrrrrrrrrrrrrrrrrrrr", + ext_ids=fatcat_openapi_client.ReleaseExtIds(), + ), + ] + ] * 4 + fatcat_get_release_raw = mocker.patch( + "fatcat_openapi_client.DefaultApi.get_release" + ) + fatcat_get_release_raw.side_effect = [ + fatcat_openapi_client.ReleaseEntity( + state="active", + ident="rrrrrrrrrrrrrrrrrrrrrrrrrr", + ext_ids=fatcat_openapi_client.ReleaseExtIds(), + files=[ + fatcat_openapi_client.FileEntity( + ident="ffffffffffffffffffffffffff", + urls=[ + fatcat_openapi_client.FileUrl( + rel="web", url="https://blarg.example.com", + ), + fatcat_openapi_client.FileUrl( + rel="webarchive", + url="https://web.archive.org/web/12345/https://example.com", + ), + fatcat_openapi_client.FileUrl( + rel="archive", + url="https://archive.org/download/some/thing.pdf", + ), + ], + ), + ], + ) + ] * 4 + + # redirects should work after API lookup, for both wayback and archive.org + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://example.com", + allow_redirects=False, + ) + assert rv.status_code == 302 + assert ( + rv.headers["Location"] + == "https://web.archive.org/web/12345id_/https://example.com" + ) + + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.pdf", + allow_redirects=False, + ) + assert rv.status_code == 302 + assert rv.headers["Location"] == "https://archive.org/download/some/thing.pdf" + + # wrong URLs should still not work, but display a page with helpful links + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY", + allow_redirects=False, + ) + assert rv.status_code == 404 + assert b"Access Location Not Found" in rv.content + assert b"web.archive.org/web/*/https://www.federalreserve.gov" in rv.content + + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.else.pdf", + allow_redirects=False, + ) + assert rv.status_code == 404 + assert b"Access Location Not Found" in rv.content + assert b"archive.org/download/some/thing.else.pdf" in rv.content + + def test_access_redirect_encoding(client: Any, mocker: Any) -> None: with open("tests/files/elastic_get_work_a6gvpil4brdgzhqyaog3ftngqe.json") as f: |