about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- fatcat_scholar/search.py 25
-rw-r--r-- fatcat_scholar/web.py 83
-rw-r--r-- tests/test_web.py 39
3 files changed, 72 insertions, 75 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index f5056c7..121cb69 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -21,7 +21,7 @@ from pydantic import BaseModel
from fatcat_scholar.config import settings
from fatcat_scholar.identifiers import *
-from fatcat_scholar.schema import ScholarDoc, ScholarFulltext
+from fatcat_scholar.schema import ScholarDoc
from fatcat_scholar.query_parse import sniff_citation_query, pre_parse_query
from fatcat_scholar.query_citation import try_fuzzy_match
@@ -464,26 +464,3 @@ def get_es_scholar_doc(key: str) -> Optional[dict]:
except Exception:
pass
return doc
-
-
-def lookup_fulltext_pdf(sha1: str) -> Optional[ScholarFulltext]:
- """
- Fetch a document by fulltext file sha1, returning only the 'fulltext' sub-document.
- """
- sha1 = sha1.lower()
- assert len(sha1) == 40 and sha1.isalnum()
- hits = do_lookup_query(
- f'fulltext.file_sha1:{sha1} fulltext.file_mimetype:"application/pdf" fulltext.access_url:*'
- )
- if not hits.results:
- return None
- fulltext = ScholarFulltext.parse_obj(hits.results[0]["fulltext"])
- if not fulltext.access_type in ("ia_file", "wayback"):
- return None
- if fulltext.file_sha1 != sha1:
- return None
- if fulltext.file_mimetype != "application/pdf":
- return None
- if not fulltext.access_url:
- return None
- return fulltext
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index 253d99c..b5af18e 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -29,7 +29,6 @@ from fatcat_scholar.config import settings, GIT_REVISION
from fatcat_scholar.hacks import (
Jinja2Templates,
parse_accept_lang,
- wayback_direct_url,
make_access_redirect_url,
)
from fatcat_scholar.search import (
@@ -38,7 +37,6 @@ from fatcat_scholar.search import (
FulltextHits,
es_scholar_index_alive,
get_es_scholar_doc,
- lookup_fulltext_pdf,
)
from fatcat_scholar.schema import ScholarDoc
@@ -185,48 +183,69 @@ def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict
@api.get(
- "/access-redirect/{sha1}.pdf",
- operation_id="access_redirect_pdf",
- include_in_schema=False,
-)
-def access_redirect_pdf(sha1: str = Query(..., min_length=40, max_length=40)) -> Any:
- """
- NOTE: DEPRECATED
- """
- fulltext = lookup_fulltext_pdf(sha1)
- if not fulltext or not fulltext.access_url:
- raise HTTPException(status_code=404, detail="PDF file not found")
- access_url = fulltext.access_url
- if fulltext.access_type == "wayback":
- access_url = wayback_direct_url(access_url)
- return RedirectResponse(access_url, status_code=302)
-
-
-@api.get(
- "/access/wayback/{timestamp}/{url:path}",
+ "/work/{work_ident}/access/wayback/{url:path}",
operation_id="access_redirect_wayback",
include_in_schema=False,
)
-def access_redirect_wayback(timestamp: int, url: str, request: Request) -> Any:
- original_url = "/".join(str(request.url).split("/")[6:])
+def access_redirect_wayback(
+ url: str,
+ request: Request,
+ work_ident: str = Query(..., min_length=20, max_length=20),
+) -> Any:
+ raw_original_url = "/".join(str(request.url).split("/")[7:])
# the quote() call is necessary because the URL is un-encoded in the path parameter
# see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d
- access_url = urllib.parse.quote(
- f"https://web.archive.org/web/{timestamp}id_/{original_url}",
- safe=":/%#?=@[]!$&'()*+,;",
- )
- return RedirectResponse(access_url, status_code=302)
+ original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",)
+ doc_dict = get_es_scholar_doc(f"work_{work_ident}")
+ if not doc_dict:
+ raise HTTPException(status_code=404, detail="work not found")
+ doc: ScholarDoc = doc_dict["_obj"]
+ # combine fulltext with all access options
+ access: List[Any] = []
+ if doc.fulltext:
+ access.append(doc.fulltext)
+ access.extend(doc.access or [])
+ for opt in access:
+ if (
+ opt.access_type == "wayback"
+ and opt.access_url
+ and "://web.archive.org/web/" in opt.access_url
+ and opt.access_url.endswith(original_url)
+ ):
+ timestamp = opt.access_url.split("/")[4]
+ if not (len(timestamp) == 14 and timestamp.isdigit()):
+ continue
+ access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
+ return RedirectResponse(access_url, status_code=302)
+ raise HTTPException(status_code=404, detail="access URL not found")
@api.get(
- "/access/ia_file/{item}/{file_path:path}",
+ "/work/{work_ident}/access/ia_file/{item}/{file_path:path}",
operation_id="access_redirect_ia_file",
include_in_schema=False,
)
-def access_redirect_ia_file(item: str, file_path: str, request: Request) -> Any:
- original_path = urllib.parse.quote("/".join(str(request.url).split("/")[6:]))
+def access_redirect_ia_file(
+ item: str,
+ file_path: str,
+ request: Request,
+ work_ident: str = Query(..., min_length=20, max_length=20),
+) -> Any:
+ original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:]))
access_url = f"https://archive.org/download/{item}/{original_path}"
- return RedirectResponse(access_url, status_code=302)
+ doc_dict = get_es_scholar_doc(f"work_{work_ident}")
+ if not doc_dict:
+ raise HTTPException(status_code=404, detail="work not found")
+ doc: ScholarDoc = doc_dict["_obj"]
+ # combine fulltext with all access options
+ access: List[Any] = []
+ if doc.fulltext:
+ access.append(doc.fulltext)
+ access.extend(doc.access or [])
+ for opt in access:
+ if opt.access_type == "ia_file" and opt.access_url == access_url:
+ return RedirectResponse(access_url, status_code=302)
+ raise HTTPException(status_code=404, detail="access URL not found")
web = APIRouter()
diff --git a/tests/test_web.py b/tests/test_web.py
index ee11ee6..7da5880 100644
--- a/tests/test_web.py
+++ b/tests/test_web.py
@@ -125,11 +125,8 @@ def test_basic_work_landing_page(client: Any, mocker: Any) -> None:
def test_basic_access_redirect(client: Any, mocker: Any) -> None:
- """
- NOTE: DEPRECATED
- """
- with open("tests/files/elastic_fulltext_search.json") as f:
+ with open("tests/files/elastic_fulltext_get.json") as f:
elastic_resp = json.loads(f.read())
es_raw = mocker.patch(
@@ -141,7 +138,7 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None:
]
rv = client.get(
- "/access-redirect/f81f84e23c9ba5d364c70f01fa26e645d29c0427.pdf",
+ "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf",
allow_redirects=False,
)
assert rv.status_code == 302
@@ -150,39 +147,43 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None:
== "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf"
)
+ # check that URL is validated
rv = client.get(
- "/access-redirect/aaaaaaaaaaaaaaaaaaaaaa01fa26e645d29c0427.pdf",
+ "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY",
allow_redirects=False,
)
assert rv.status_code == 404
-def test_access_redirects(client: Any, mocker: Any) -> None:
+def test_access_redirect_encoding(client: Any, mocker: Any) -> None:
- # tricky "URL encoding in archive.org path" case
- rv = client.get(
- "/access/ia_file/crossref-pre-1909-scholarly-works/10.1016%252Fs0140-6736%252802%252912493-7.zip/10.1016%252Fs0140-6736%252802%252912928-x.pdf",
- allow_redirects=False,
- )
- assert rv.status_code == 302
- assert (
- rv.headers["Location"]
- == "https://archive.org/download/crossref-pre-1909-scholarly-works/10.1016%252Fs0140-6736%252802%252912493-7.zip/10.1016%252Fs0140-6736%252802%252912928-x.pdf"
+ with open("tests/files/elastic_get_work_a6gvpil4brdgzhqyaog3ftngqe.json") as f:
+ elastic_ia_resp = json.loads(f.read())
+ with open("tests/files/elastic_get_work_ao5l3ykgbvg2vfpqe2y5qold5y.json") as f:
+ elastic_wayback_resp = json.loads(f.read())
+
+ es_raw = mocker.patch(
+ "elasticsearch.connection.Urllib3HttpConnection.perform_request"
)
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_ia_resp)),
+ (200, {}, json.dumps(elastic_wayback_resp)),
+ ]
+ # tricky "URL encoding in archive.org path" case
rv = client.get(
- "/access/wayback/20170814015956/https://epub.uni-regensburg.de/21901/1/lorenz73.pdf",
+ "/work/a6gvpil4brdgzhqyaog3ftngqe/access/ia_file/crossref-pre-1909-scholarly-works/10.1016%252Fs0140-6736%252802%252912493-7.zip/10.1016%252Fs0140-6736%252802%252912928-x.pdf",
allow_redirects=False,
)
assert rv.status_code == 302
assert (
rv.headers["Location"]
- == "https://web.archive.org/web/20170814015956id_/https://epub.uni-regensburg.de/21901/1/lorenz73.pdf"
+ == "https://archive.org/download/crossref-pre-1909-scholarly-works/10.1016%252Fs0140-6736%252802%252912493-7.zip/10.1016%252Fs0140-6736%252802%252912928-x.pdf"
)
# spaces ("%20" vs "+")
rv = client.get(
- "/access/wayback/20170811115414/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf",
+ "/work/ao5l3ykgbvg2vfpqe2y5qold5y/access/wayback/http://sudjms.net/issues/5-4/pdf/8)A%20comparison%20study%20of%20histochemical%20staining%20of%20various%20tissues%20after.pdf",
allow_redirects=False,
)
assert rv.status_code == 302