web: initial implementation of work landing page and citation_pdf_url access redirect

The initial intent is to have something that can be used by indexing services to pull the citation_pdf_url meta tag and bounce to a direct IA PDF access URL. For now the landing page stubs are just formatted as SERP results. Presumably these will get re-styled at some point and include citation graph links, etc.
author: Bryan Newbold <bnewbold@archive.org> 2021-04-23 19:01:22 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-04-23 19:03:40 -0700
commit: e35e99bceff3277afaac8f2d5519aa4f07aabe49 (patch)
tree: a07ff995c7c64362c895dea3933060a832d04c38 /fatcat_scholar/web.py
parent: d394c39c6a1b05033029c42377fbf40603b07bbd (diff)
download: fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.tar.gz
fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.zip
1 files changed, 90 insertions, 2 deletions
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index 895af18..56f2561 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -10,9 +10,14 @@ from typing import Optional, Any, List, Dict
 from pydantic import BaseModel
 import babel.numbers
 import babel.support
-from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException
+from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException, Query
 from fastapi.staticfiles import StaticFiles
-from fastapi.responses import PlainTextResponse, JSONResponse, FileResponse
+from fastapi.responses import (
+    PlainTextResponse,
+    JSONResponse,
+    FileResponse,
+    RedirectResponse,
+)
 from fastapi.middleware.cors import CORSMiddleware
 import sentry_sdk
 from sentry_sdk.integrations.asgi import SentryAsgiMiddleware
@@ -26,6 +31,8 @@ from fatcat_scholar.search import (
     FulltextQuery,
     FulltextHits,
     es_scholar_index_alive,
+    get_es_scholar_doc,
+    lookup_fulltext_pdf,
 )
 from fatcat_scholar.schema import ScholarDoc
 
@@ -160,6 +167,60 @@ def search(query: FulltextQuery = Depends(FulltextQuery)) -> FulltextHits:
     return hits
 
 
+@api.get("/work/{work_ident}", operation_id="get_work")  # API route: return one work document as a dict
+def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict:  # NOTE(review): work_ident is a path parameter but is declared with Query(); FastAPI documents Path() for path-parameter validation -- confirm the 20-character limit is actually enforced
+    doc = get_es_scholar_doc(f"work_{work_ident}")  # lookup key is the ident with a "work_" prefix
+    if not doc:
+        raise HTTPException(status_code=404, detail="work not found")  # falsy lookup result -> HTTP 404
+    doc.pop("_obj", None)  # drop the "_obj" key (if present) before returning the document
+    return doc
+
+
+def wayback_direct_url(url: str) -> str:
+    """
+    Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access)
+    """
+    if not "://web.archive.org" in url:  # URLs without the web.archive.org host marker are returned unchanged (same meaning as `not in`)
+        return url
+    segments = url.split("/")  # for "https://web.archive.org/web/1234/http://..." index 4 is the "1234" path segment
+    if len(segments) < 6 or not segments[4].isdigit():  # too few segments, or segment 4 is not all digits (e.g. "*"): return unchanged
+        return url
+    segments[4] += "id_"  # "1234" -> "1234id_"
+    return "/".join(segments)  # reassemble with the same "/" separator used for the split
+
+
+def test_wayback_direct_url() -> None:  # checks the three cases handled by wayback_direct_url()
+    assert (  # URL not on web.archive.org: expected back unchanged
+        wayback_direct_url("http://fatcat.wiki/thing.pdf")
+        == "http://fatcat.wiki/thing.pdf"
+    )
+    assert (  # "*" where a numeric segment would be: expected back unchanged
+        wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf")
+        == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf"
+    )
+    assert (  # all-digit segment "1234": expected to gain the "id_" suffix
+        wayback_direct_url(
+            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
+        )
+        == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf"
+    )
+
+
+@api.get(  # API route: redirect a PDF request, keyed by the sha1 path segment, to its access URL
+    "/access-redirect/{sha1}.pdf",
+    operation_id="access_redirect_pdf",
+    include_in_schema=False,
+)
+def access_redirect_pdf(sha1: str = Query(..., min_length=40, max_length=40)) -> Any:  # NOTE(review): sha1 is a path parameter but is declared with Query(); FastAPI documents Path() for path-parameter validation -- confirm the 40-character limit is actually enforced
+    fulltext = lookup_fulltext_pdf(sha1)
+    if not fulltext or not fulltext.access_url:  # no lookup result, or a result without an access URL -> HTTP 404
+        raise HTTPException(status_code=404, detail="PDF file not found")
+    access_url = fulltext.access_url
+    if fulltext.access_type == "wayback":  # only "wayback" access URLs are passed through wayback_direct_url()
+        access_url = wayback_direct_url(access_url)
+    return RedirectResponse(access_url, status_code=302)  # HTTP 302 redirect to the resolved URL
+
+
 web = APIRouter()
 
 
@@ -296,6 +357,33 @@ def web_search(
     )
 
 
+@web.get("/work/{work_ident}", include_in_schema=False)
+def web_work(  # work page: JSON via get_work() when negotiated, otherwise renders the "work.html" template
+    request: Request,
+    response: Response,  # not referenced in the function body
+    work_ident: str = Query(..., min_length=20, max_length=20),  # NOTE(review): path parameter declared with Query(); FastAPI documents Path() for this -- confirm the length limit is enforced
+    lang: LangPrefix = Depends(LangPrefix),
+    content: ContentNegotiation = Depends(ContentNegotiation),
+) -> Any:
+
+    if content.mimetype == "application/json":  # JSON requested: delegate to the API handler (which raises 404 itself)
+        return get_work(work_ident)
+
+    doc = get_es_scholar_doc(f"work_{work_ident}")  # same "work_"-prefixed lookup key as get_work()
+    if not doc:
+        raise HTTPException(status_code=404, detail="work not found")  # falsy lookup result -> HTTP 404
+
+    return i18n_templates[lang.code].TemplateResponse(  # template set is selected by lang.code
+        "work.html",
+        {
+            "request": request,
+            "locale": lang.code,
+            "lang_prefix": lang.prefix,
+            "doc": doc,  # passed to the template as-is; "_obj" is not removed on this path
+        },
+    )
+
+
 app = FastAPI(
     title="Fatcat Scholar",
     description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.",
author	Bryan Newbold <bnewbold@archive.org>	2021-04-23 19:01:22 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-04-23 19:03:40 -0700
commit	e35e99bceff3277afaac8f2d5519aa4f07aabe49 (patch)
tree	a07ff995c7c64362c895dea3933060a832d04c38 /fatcat_scholar/web.py
parent	d394c39c6a1b05033029c42377fbf40603b07bbd (diff)
download	fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.tar.gz fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.zip