From 3bae05c4a4cd7d6d9b892b952b7ca35454319479 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 11 Jun 2021 15:03:32 -0700
Subject: update access redirect URL endpoints

---
 fatcat_scholar/search.py | 25 +--------------
 fatcat_scholar/web.py    | 83 +++++++++++++++++++++++++++++-------------------
 tests/test_web.py        | 39 ++++++++++++-----------
 3 files changed, 72 insertions(+), 75 deletions(-)

diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index f5056c7..121cb69 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -21,7 +21,7 @@ from pydantic import BaseModel
 
 from fatcat_scholar.config import settings
 from fatcat_scholar.identifiers import *
-from fatcat_scholar.schema import ScholarDoc, ScholarFulltext
+from fatcat_scholar.schema import ScholarDoc
 from fatcat_scholar.query_parse import sniff_citation_query, pre_parse_query
 from fatcat_scholar.query_citation import try_fuzzy_match
 
@@ -464,26 +464,3 @@ def get_es_scholar_doc(key: str) -> Optional[dict]:
         except Exception:
             pass
     return doc
-
-
-def lookup_fulltext_pdf(sha1: str) -> Optional[ScholarFulltext]:
-    """
-    Fetch a document by fulltext file sha1, returning only the 'fulltext' sub-document.
-    """
-    sha1 = sha1.lower()
-    assert len(sha1) == 40 and sha1.isalnum()
-    hits = do_lookup_query(
-        f'fulltext.file_sha1:{sha1} fulltext.file_mimetype:"application/pdf" fulltext.access_url:*'
-    )
-    if not hits.results:
-        return None
-    fulltext = ScholarFulltext.parse_obj(hits.results[0]["fulltext"])
-    if not fulltext.access_type in ("ia_file", "wayback"):
-        return None
-    if fulltext.file_sha1 != sha1:
-        return None
-    if fulltext.file_mimetype != "application/pdf":
-        return None
-    if not fulltext.access_url:
-        return None
-    return fulltext
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index 253d99c..b5af18e 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -29,7 +29,6 @@ from fatcat_scholar.config import settings, GIT_REVISION
 from fatcat_scholar.hacks import (
     Jinja2Templates,
     parse_accept_lang,
-    wayback_direct_url,
     make_access_redirect_url,
 )
 from fatcat_scholar.search import (
@@ -38,7 +37,6 @@ from fatcat_scholar.search import (
     FulltextHits,
     es_scholar_index_alive,
     get_es_scholar_doc,
-    lookup_fulltext_pdf,
 )
 from fatcat_scholar.schema import ScholarDoc
 
@@ -185,48 +183,69 @@ def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict
 
 
 @api.get(
-    "/access-redirect/{sha1}.pdf",
-    operation_id="access_redirect_pdf",
-    include_in_schema=False,
-)
-def access_redirect_pdf(sha1: str = Query(..., min_length=40, max_length=40)) -> Any:
-    """
-    NOTE: DEPRECATED
-    """
-    fulltext = lookup_fulltext_pdf(sha1)
-    if not fulltext or not fulltext.access_url:
-        raise HTTPException(status_code=404, detail="PDF file not found")
-    access_url = fulltext.access_url
-    if fulltext.access_type == "wayback":
-        access_url = wayback_direct_url(access_url)
-    return RedirectResponse(access_url, status_code=302)
-
-
-@api.get(
-    "/access/wayback/{timestamp}/{url:path}",
+    "/work/{work_ident}/access/wayback/{url:path}",
     operation_id="access_redirect_wayback",
     include_in_schema=False,
 )
-def access_redirect_wayback(timestamp: int, url: str, request: Request) -> Any:
-    original_url = "/".join(str(request.url).split("/")[6:])
+def access_redirect_wayback(
+    url: str,
+    request: Request,
+    work_ident: str = Query(..., min_length=20, max_length=20),
+) -> Any:
+    raw_original_url = "/".join(str(request.url).split("/")[7:])
     # the quote() call is necessary because the URL is un-encoded in the path parameter
     # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d
-    access_url = urllib.parse.quote(
-        f"https://web.archive.org/web/{timestamp}id_/{original_url}",
-        safe=":/%#?=@[]!$&'()*+,;",
-    )
-    return RedirectResponse(access_url, status_code=302)
+    original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",)
+    doc_dict = get_es_scholar_doc(f"work_{work_ident}")
+    if not doc_dict:
+        raise HTTPException(status_code=404, detail="work not found")
+    doc: ScholarDoc = doc_dict["_obj"]
+    # combine fulltext with all access options
+    access: List[Any] = []
+    if doc.fulltext:
+        access.append(doc.fulltext)
+    access.extend(doc.access or [])
+    for opt in access:
+        if (
+            opt.access_type == "wayback"
+            and opt.access_url
+            and "://web.archive.org/web/" in opt.access_url
+            and opt.access_url.endswith(original_url)
+        ):
+            timestamp = opt.access_url.split("/")[4]
+            if not (len(timestamp) == 14 and timestamp.isdigit()):
+                continue
+            access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
+            return RedirectResponse(access_url, status_code=302)
+    raise HTTPException(status_code=404, detail="access URL not found")
 
 
 @api.get(
-    "/access/ia_file/{item}/{file_path:path}",
+    "/work/{work_ident}/access/ia_file/{item}/{file_path:path}",
     operation_id="access_redirect_ia_file",
     include_in_schema=False,
 )
-def access_redirect_ia_file(item: str, file_path: str, request: Request) -> Any:
-    original_path = urllib.parse.quote("/".join(str(request.url).split("/")[6:]))
+def access_redirect_ia_file(
+    item: str,
+    file_path: str,
+    request: Request,
+    work_ident: str = Query(..., min_length=20, max_length=20),
+) -> Any:
+    original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:]))
     access_url = f"https://archive.org/download/{item}/{original_path}"
-    return RedirectResponse(access_url, status_code=302)
+    doc_dict = get_es_scholar_doc(f"work_{work_ident}")
+    if not doc_dict:
+        raise HTTPException(status_code=404, detail="work not found")
+    doc: ScholarDoc = doc_dict["_obj"]
+    # combine fulltext with all access options
+    access: List[Any] = []
+    if doc.fulltext:
+        access.append(doc.fulltext)
+    access.extend(doc.access or [])
+    for opt in access:
+        if opt.access_type == "ia_file" and opt.access_url == access_url:
+            return RedirectResponse(access_url, status_code=302)
+    raise HTTPException(status_code=404, detail="access URL not found")
 
 
 web = APIRouter()
diff --git a/tests/test_web.py b/tests/test_web.py
index ee11ee6..7da5880 100644
--- a/tests/test_web.py
+++ b/tests/test_web.py
@@ -125,11 +125,8 @@ def test_basic_work_landing_page(client: Any, mocker: Any) -> None:
 
 
 def test_basic_access_redirect(client: Any, mocker: Any) -> None:
-    """
-    NOTE: DEPRECATED
-    """
 
-    with open("tests/files/elastic_fulltext_search.json") as f:
+    with open("tests/files/elastic_fulltext_get.json") as f:
         elastic_resp = json.loads(f.read())
 
     es_raw = mocker.patch(
@@ -141,7 +138,7 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None:
     ]
 
     rv = client.get(
-        "/access-redirect/f81f84e23c9ba5d364c70f01fa26e645d29c0427.pdf",
+        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf",
         allow_redirects=False,
     )
     assert rv.status_code == 302
@@ -150,39 +147,43 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None:
         == "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf"
     )
 
+    # check that URL is validated
     rv = client.get(
-        "/access-redirect/aaaaaaaaaaaaaaaaaaaaaa01fa26e645d29c0427.pdf",
+        "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY",
         allow_redirects=False,
     )
     assert rv.status_code == 404
 
 
-def test_access_redirects(client: Any, mocker: Any) -> None:
+def test_access_redirect_encoding(client: Any, mocker: Any) -> None:
 
-    # tricky "URL encoding in archive.org path" case
-    rv = client.get(
-        "/access/ia_file/crossref-pre-1909-scholarly-works/10.1016%252Fs0140-6736%252802%252912493-7.zip/10.1016%252Fs0140-6736%252802%252912928-x.pdf",
-        allow_redirects=False,
-    )
-    assert rv.status_code == 302
-    assert (
-        rv.headers["Location"]
-        == "https://archive.org/download/crossref-pre-1909-scholarly-works/10.1016%252Fs0140-6736%252802%252912493-7.zip/10.1016%252Fs0140-6736%252802%252912928-x.pdf"
+    with open("tests/files/elastic_get_work_a6gvpil4brdgzhqyaog3ftngqe.json") as f:
+        elastic_ia_resp = json.loads(f.read())
+    with open("tests/files/elastic_get_work_ao5l3ykgbvg2vfpqe2y5qold5y.json") as f:
+        elastic_wayback_resp = json.loads(f.read())
+
+    es_raw = mocker.patch(
+        "elasticsearch.connection.Urllib3HttpConnection.perform_request"
     )
+    es_raw.side_effect = [
+        (200, {}, json.dumps(elastic_ia_resp)),
+        (200, {}, json.dumps(elastic_wayback_resp)),
+    ]
 
+    # tricky "URL encoding in archive.org path" case
     rv = client.get(
-        "/access/wayback/20170814015956/https://epub.uni-regensburg.de/21901/1/lorenz73.pdf",
+        "/work/a6gvpil4brdgzhqyaog3ftngqe/access/ia_file/crossref-pre-1909-scholarly-works/10.1016%252Fs0140-6736%252802%252912493-7.zip/10.1016%252Fs0140-6736%252802%252912928-x.pdf",
         allow_redirects=False,
     )
     assert rv.status_code == 302
     assert (
         rv.headers["Location"]
-        == "https://web.archive.org/web/20170814015956id_/https://epub.uni-regensburg.de/21901/1/lorenz73.pdf"
+        == "https://archive.org/download/crossref-pre-1909-scholarly-works/10.1016%252Fs0140-6736%252802%252912493-7.zip/10.1016%252Fs0140-6736%252802%252912928-x.pdf"
     )
 
     # spaces ("%20" vs "+")
     rv = client.get(
-        "/access/wayback/20170811115414/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf",
+        "/work/ao5l3ykgbvg2vfpqe2y5qold5y/access/wayback/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf",
         allow_redirects=False,
     )
     assert rv.status_code == 302
--
cgit v1.2.3
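
For context on the redirect logic added to fatcat_scholar/web.py above: the new wayback endpoint only issues a 302 when one of the work's access options is a web.archive.org URL that ends with the requested original URL and embeds a plausible 14-digit snapshot timestamp; the redirect target then uses the "id_" modifier so the Wayback Machine serves the raw archived bytes rather than the rewritten replay page. The sketch below is a minimal standalone restatement of that check; the helper name build_wayback_direct_url and the example values in the comment are illustrative and not part of the patch.

from typing import Optional


def build_wayback_direct_url(access_url: str, original_url: str) -> Optional[str]:
    # Illustrative helper (not in the patch): mirrors the validation that the
    # new access_redirect_wayback() endpoint performs before redirecting.
    # Expected access_url shape:
    #   https://web.archive.org/web/20200206164725/https://example.com/paper.pdf
    if "://web.archive.org/web/" not in access_url:
        return None
    if not access_url.endswith(original_url):
        return None
    # element 4 of the "/"-split URL is the 14-digit snapshot timestamp
    timestamp = access_url.split("/")[4]
    if not (len(timestamp) == 14 and timestamp.isdigit()):
        return None
    # the "id_" modifier requests the original archived bytes instead of the
    # replay UI wrapper
    return f"https://web.archive.org/web/{timestamp}id_/{original_url}"


# Example, using the URLs exercised in tests/test_web.py above:
#   build_wayback_direct_url(
#       "https://web.archive.org/web/20200206164725/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf",
#       "https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf",
#   )
#   returns
#   "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf"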