aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/web.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-04-23 19:01:22 -0700
committerBryan Newbold <bnewbold@archive.org>2021-04-23 19:03:40 -0700
commite35e99bceff3277afaac8f2d5519aa4f07aabe49 (patch)
treea07ff995c7c64362c895dea3933060a832d04c38 /fatcat_scholar/web.py
parentd394c39c6a1b05033029c42377fbf40603b07bbd (diff)
downloadfatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.tar.gz
fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.zip
web: initial implementation of work landing page and citation_pdf_url access redirect
The initial intent is to have something that can be used by indexing services to pull the citation_pdf_url meta tag and bounce to a direct IA PDF access URL. For now the landing page stubs are just formatted as SERP results. Presumably these will get re-styled at some point and include citation graph links, etc.
Diffstat (limited to 'fatcat_scholar/web.py')
-rw-r--r--fatcat_scholar/web.py92
1 files changed, 90 insertions, 2 deletions
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index 895af18..56f2561 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -10,9 +10,14 @@ from typing import Optional, Any, List, Dict
from pydantic import BaseModel
import babel.numbers
import babel.support
-from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException
+from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException, Query
from fastapi.staticfiles import StaticFiles
-from fastapi.responses import PlainTextResponse, JSONResponse, FileResponse
+from fastapi.responses import (
+ PlainTextResponse,
+ JSONResponse,
+ FileResponse,
+ RedirectResponse,
+)
from fastapi.middleware.cors import CORSMiddleware
import sentry_sdk
from sentry_sdk.integrations.asgi import SentryAsgiMiddleware
@@ -26,6 +31,8 @@ from fatcat_scholar.search import (
FulltextQuery,
FulltextHits,
es_scholar_index_alive,
+ get_es_scholar_doc,
+ lookup_fulltext_pdf,
)
from fatcat_scholar.schema import ScholarDoc
@@ -160,6 +167,60 @@ def search(query: FulltextQuery = Depends(FulltextQuery)) -> FulltextHits:
return hits
+@api.get("/work/{work_ident}", operation_id="get_work")
+def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict:
+    """
+    Returns the document for a single work, fetched with
+    get_es_scholar_doc() under the key "work_{work_ident}".
+
+    Raises an HTTP 404 error if the lookup returns nothing. The "_obj" key
+    (if present) is dropped from the document before it is returned.
+    """
+    # NOTE(review): work_ident is taken from the URL path but is declared
+    # with Query(); confirm the pinned FastAPI version accepts this, or
+    # consider declaring it with Path() instead.
+    work_doc = get_es_scholar_doc(f"work_{work_ident}")
+    if work_doc:
+        work_doc.pop("_obj", None)
+        return work_doc
+    raise HTTPException(status_code=404, detail="work not found")
+
+
+def wayback_direct_url(url: str) -> str:
+    """
+    Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access)
+
+    Only URLs of the form "<scheme>://web.archive.org/<x>/<digits>/<rest...>"
+    are rewritten: 'id_' is appended to the all-digit timestamp segment. Any
+    other URL is returned unchanged, including non-wayback hosts, wildcard
+    timestamps ('*'), and timestamps which already carry a modifier such as
+    'id_'.
+    """
+    segments = url.split("/")
+    # expected layout: [scheme:, "", host, "web", timestamp, original-url...]
+    if len(segments) < 6:
+        return url
+    scheme, authority_gap, authority = segments[0], segments[1], segments[2]
+    if not scheme.endswith(":") or authority_gap != "":
+        return url
+    # Compare the actual host (ignoring any ":port") instead of substring
+    # matching the whole URL. A substring test also matches URLs that merely
+    # mention the wayback host in a query string, or look-alike hosts such as
+    # "web.archive.org.example.com".
+    if authority.partition(":")[0] != "web.archive.org":
+        return url
+    if not segments[4].isdigit():
+        return url
+    segments[4] += "id_"
+    return "/".join(segments)
+
+
+def test_wayback_direct_url() -> None:
+    """
+    Checks wayback_direct_url() against a non-wayback URL, a wayback URL with
+    a wildcard timestamp, and a wayback URL with a numeric timestamp.
+    """
+    # (input URL, expected output URL)
+    cases = [
+        # non-wayback URLs pass through untouched
+        ("http://fatcat.wiki/thing.pdf", "http://fatcat.wiki/thing.pdf"),
+        # wildcard timestamp is not rewritten
+        (
+            "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf",
+            "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf",
+        ),
+        # numeric timestamp gets the 'id_' suffix
+        (
+            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf",
+            "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf",
+        ),
+    ]
+    for original_url, expected_url in cases:
+        assert wayback_direct_url(original_url) == expected_url
+
+
+@api.get(
+    "/access-redirect/{sha1}.pdf",
+    operation_id="access_redirect_pdf",
+    include_in_schema=False,
+)
+def access_redirect_pdf(sha1: str = Query(..., min_length=40, max_length=40)) -> Any:
+    """
+    Redirects (HTTP 302) to the access URL of the PDF returned by
+    lookup_fulltext_pdf() for the given SHA-1.
+
+    Raises an HTTP 404 error if the lookup returns nothing, or the result has
+    no access URL. When the access type is "wayback", the URL is first passed
+    through wayback_direct_url().
+    """
+    # NOTE(review): sha1 is taken from the URL path but is declared with
+    # Query(); confirm the pinned FastAPI version accepts this, or consider
+    # declaring it with Path() instead.
+    fulltext = lookup_fulltext_pdf(sha1)
+    target_url = fulltext.access_url if fulltext else None
+    if not target_url:
+        raise HTTPException(status_code=404, detail="PDF file not found")
+    if fulltext.access_type == "wayback":
+        target_url = wayback_direct_url(target_url)
+    return RedirectResponse(target_url, status_code=302)
+
+
web = APIRouter()
@@ -296,6 +357,33 @@ def web_search(
)
+@web.get("/work/{work_ident}", include_in_schema=False)
+def web_work(
+    request: Request,
+    response: Response,
+    work_ident: str = Query(..., min_length=20, max_length=20),
+    lang: LangPrefix = Depends(LangPrefix),
+    content: ContentNegotiation = Depends(ContentNegotiation),
+) -> Any:
+    """
+    Landing page for a single work.
+
+    If the negotiated mimetype is "application/json", this delegates to
+    get_work() and returns its result. Otherwise the document is fetched with
+    get_es_scholar_doc() and rendered with the "work.html" template for the
+    request's language. Raises an HTTP 404 error if no document is found.
+    """
+    # NOTE(review): work_ident is taken from the URL path but is declared
+    # with Query(); confirm the pinned FastAPI version accepts this, or
+    # consider declaring it with Path() instead.
+    wants_json = content.mimetype == "application/json"
+    if wants_json:
+        return get_work(work_ident)
+
+    work_doc = get_es_scholar_doc(f"work_{work_ident}")
+    if not work_doc:
+        raise HTTPException(status_code=404, detail="work not found")
+
+    templates = i18n_templates[lang.code]
+    template_context = {
+        "request": request,
+        "locale": lang.code,
+        "lang_prefix": lang.prefix,
+        "doc": work_doc,
+    }
+    return templates.TemplateResponse("work.html", template_context)
+
+
app = FastAPI(
title="Fatcat Scholar",
description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.",