diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-04-23 19:01:22 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-04-23 19:03:40 -0700 |
commit | e35e99bceff3277afaac8f2d5519aa4f07aabe49 (patch) | |
tree | a07ff995c7c64362c895dea3933060a832d04c38 /fatcat_scholar/web.py | |
parent | d394c39c6a1b05033029c42377fbf40603b07bbd (diff) | |
download | fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.tar.gz fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.zip |
web: initial implementation of work landing page and citation_pdf_url access redirect
The initial intent is to have something that can be used by indexing
services to pull the citation_pdf_url meta tag and bounce to a direct
IA PDF access URL.
For now the landing page stubs are just formatted as SERP results.
Presumably these will get re-styled at some point and include citation
graph links, etc.
Diffstat (limited to 'fatcat_scholar/web.py')
-rw-r--r-- | fatcat_scholar/web.py | 92 |
1 files changed, 90 insertions, 2 deletions
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py index 895af18..56f2561 100644 --- a/fatcat_scholar/web.py +++ b/fatcat_scholar/web.py @@ -10,9 +10,14 @@ from typing import Optional, Any, List, Dict from pydantic import BaseModel import babel.numbers import babel.support -from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException +from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException, Query from fastapi.staticfiles import StaticFiles -from fastapi.responses import PlainTextResponse, JSONResponse, FileResponse +from fastapi.responses import ( + PlainTextResponse, + JSONResponse, + FileResponse, + RedirectResponse, +) from fastapi.middleware.cors import CORSMiddleware import sentry_sdk from sentry_sdk.integrations.asgi import SentryAsgiMiddleware @@ -26,6 +31,8 @@ from fatcat_scholar.search import ( FulltextQuery, FulltextHits, es_scholar_index_alive, + get_es_scholar_doc, + lookup_fulltext_pdf, ) from fatcat_scholar.schema import ScholarDoc @@ -160,6 +167,60 @@ def search(query: FulltextQuery = Depends(FulltextQuery)) -> FulltextHits: return hits +@api.get("/work/{work_ident}", operation_id="get_work") +def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict: + doc = get_es_scholar_doc(f"work_{work_ident}") + if not doc: + raise HTTPException(status_code=404, detail="work not found") + doc.pop("_obj", None) + return doc + + +def wayback_direct_url(url: str) -> str: + """ + Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access) + """ + if not "://web.archive.org" in url: + return url + segments = url.split("/") + if len(segments) < 6 or not segments[4].isdigit(): + return url + segments[4] += "id_" + return "/".join(segments) + + +def test_wayback_direct_url() -> None: + assert ( + wayback_direct_url("http://fatcat.wiki/thing.pdf") + == "http://fatcat.wiki/thing.pdf" + ) + assert ( + 
wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf") + == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf" + ) + assert ( + wayback_direct_url( + "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf" + ) + == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf" + ) + + +@api.get( + "/access-redirect/{sha1}.pdf", + operation_id="access_redirect_pdf", + include_in_schema=False, +) +def access_redirect_pdf(sha1: str = Query(..., min_length=40, max_length=40)) -> Any: + fulltext = lookup_fulltext_pdf(sha1) + if not fulltext or not fulltext.access_url: + raise HTTPException(status_code=404, detail="PDF file not found") + access_url = fulltext.access_url + if fulltext.access_type == "wayback": + access_url = wayback_direct_url(access_url) + return RedirectResponse(access_url, status_code=302) + + web = APIRouter() @@ -296,6 +357,33 @@ def web_search( ) +@web.get("/work/{work_ident}", include_in_schema=False) +def web_work( + request: Request, + response: Response, + work_ident: str = Query(..., min_length=20, max_length=20), + lang: LangPrefix = Depends(LangPrefix), + content: ContentNegotiation = Depends(ContentNegotiation), +) -> Any: + + if content.mimetype == "application/json": + return get_work(work_ident) + + doc = get_es_scholar_doc(f"work_{work_ident}") + if not doc: + raise HTTPException(status_code=404, detail="work not found") + + return i18n_templates[lang.code].TemplateResponse( + "work.html", + { + "request": request, + "locale": lang.code, + "lang_prefix": lang.prefix, + "doc": doc, + }, + ) + + app = FastAPI( title="Fatcat Scholar", description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.", |