From 7ebcdfebdf4e1c8b69026a24cafe500c67dbc384 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 26 Jul 2021 20:52:56 -0700 Subject: web: access_redirect_fallback mechanism This adds a helper code path that "tries harder" to find an access link, by querying the fatcat API directly to look for any file from any release associated with the work. If it finds a match, it does the redirect as usual (but does log the incident). If no match can be found, there is now a more helpful access-specific 404 error page. If the *work* is a 404, the generic error page is shown. --- tests/test_web.py | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 1 deletion(-) (limited to 'tests') diff --git a/tests/test_web.py b/tests/test_web.py index 7f1f72a..d9cfab6 100644 --- a/tests/test_web.py +++ b/tests/test_web.py @@ -3,6 +3,7 @@ from typing import Any import pytest from fastapi.testclient import TestClient +import fatcat_openapi_client from fatcat_scholar.web import app @@ -148,7 +149,11 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None: == "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf" ) - # check that URL is validated + # check that URL is validated (force fatcat API fallback to fail) + fatcat_api_raw = mocker.patch("fatcat_openapi_client.ApiClient.call_api") + fatcat_api_raw.side_effect = [ + fatcat_openapi_client.ApiException(status=404, reason="dummy") + ] rv = client.get( "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY", allow_redirects=False, @@ -156,6 +161,102 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None: assert rv.status_code == 404 +def test_access_redirect_fallback(client: Any, mocker: Any) -> None: + + with open("tests/files/elastic_fulltext_get.json") as f: + elastic_resp = json.loads(f.read()) + + es_raw = mocker.patch( + "elasticsearch.connection.Urllib3HttpConnection.perform_request" + ) + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + ] + fatcat_get_work_raw = mocker.patch("fatcat_openapi_client.DefaultApi.get_work") + fatcat_get_work_raw.side_effect = [ + fatcat_openapi_client.WorkEntity( + state="active", ident="wwwwwwwwwwwwwwwwwwwwwwwwww", + ) + ] * 4 + fatcat_get_work_releases_raw = mocker.patch( + "fatcat_openapi_client.DefaultApi.get_work_releases" + ) + fatcat_get_work_releases_raw.side_effect = [ + [ + fatcat_openapi_client.ReleaseEntity( + ident="rrrrrrrrrrrrrrrrrrrrrrrrrr", + ext_ids=fatcat_openapi_client.ReleaseExtIds(), + ), + ] + ] * 4 + fatcat_get_release_raw = mocker.patch( + "fatcat_openapi_client.DefaultApi.get_release" + ) + fatcat_get_release_raw.side_effect = [ + fatcat_openapi_client.ReleaseEntity( + state="active", + ident="rrrrrrrrrrrrrrrrrrrrrrrrrr", + ext_ids=fatcat_openapi_client.ReleaseExtIds(), + files=[ + fatcat_openapi_client.FileEntity( + ident="ffffffffffffffffffffffffff", + urls=[ + fatcat_openapi_client.FileUrl( + rel="web", url="https://blarg.example.com", + ), + fatcat_openapi_client.FileUrl( + rel="webarchive", + url="https://web.archive.org/web/12345/https://example.com", + ), + fatcat_openapi_client.FileUrl( + rel="archive", + url="https://archive.org/download/some/thing.pdf", + ), + ], + ), + ], + ) + ] * 4 + + # redirects should work after API lookup, for both wayback and archive.org + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://example.com", + allow_redirects=False, + ) + assert rv.status_code == 302 + assert ( + rv.headers["Location"] + == "https://web.archive.org/web/12345id_/https://example.com" + ) + + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.pdf", + allow_redirects=False, + ) + assert rv.status_code == 302 + assert rv.headers["Location"] == "https://archive.org/download/some/thing.pdf" + + # wrong URLs should still not work, but display a page with helpful links + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY", + allow_redirects=False, + ) + assert rv.status_code == 404 + assert b"Access Location Not Found" in rv.content + assert b"web.archive.org/web/*/https://www.federalreserve.gov" in rv.content + + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.else.pdf", + allow_redirects=False, + ) + assert rv.status_code == 404 + assert b"Access Location Not Found" in rv.content + assert b"archive.org/download/some/thing.else.pdf" in rv.content + + def test_access_redirect_encoding(client: Any, mocker: Any) -> None: with open("tests/files/elastic_get_work_a6gvpil4brdgzhqyaog3ftngqe.json") as f: -- cgit v1.2.3