diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-04-23 19:01:22 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-04-23 19:03:40 -0700 |
commit | e35e99bceff3277afaac8f2d5519aa4f07aabe49 (patch) | |
tree | a07ff995c7c64362c895dea3933060a832d04c38 /fatcat_scholar | |
parent | d394c39c6a1b05033029c42377fbf40603b07bbd (diff) | |
download | fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.tar.gz fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.zip |
web: initial implementation of work landing page and citation_pdf_url access redirect
The initial intent is to have something that can be used by indexing
services to pull the citation_pdf_url meta tag and bounce to a direct
IA PDF access URL.
For now the landing page stubs are just formatted as SERP results.
Presumably these will get re-styled at some point and include citation
graph links, etc.
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- | fatcat_scholar/search.py | 38 | ||||
-rw-r--r-- | fatcat_scholar/templates/search_macros.html | 2 | ||||
-rw-r--r-- | fatcat_scholar/templates/work.html | 48 | ||||
-rw-r--r-- | fatcat_scholar/web.py | 92 |
4 files changed, 176 insertions, 4 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py index 5571909..c5fca35 100644 --- a/fatcat_scholar/search.py +++ b/fatcat_scholar/search.py @@ -21,7 +21,7 @@ from pydantic import BaseModel from fatcat_scholar.config import settings from fatcat_scholar.identifiers import * -from fatcat_scholar.schema import ScholarDoc +from fatcat_scholar.schema import ScholarDoc, ScholarFulltext from fatcat_scholar.query_parse import sniff_citation_query, pre_parse_query from fatcat_scholar.query_citation import try_fuzzy_match @@ -444,3 +444,39 @@ def es_scholar_index_alive() -> bool: return bool(resp["_shards"]["successful"] == resp["_shards"]["total"]) except KeyError: return False + + +def get_es_scholar_doc(key: str) -> Optional[dict]: + """ + Fetch a single document from search index, by key. Returns None if not found. + """ + try: + resp = es_client.get(settings.ELASTICSEARCH_QUERY_FULLTEXT_INDEX, key) + except elasticsearch.exceptions.NotFoundError: + return None + doc = resp["_source"] + try: + doc["_obj"] = ScholarDoc.parse_obj(doc) + except Exception: + pass + return doc + + +def lookup_fulltext_pdf(sha1: str) -> Optional[dict]: + """ + Fetch a document by fulltext file sha1, returning only the 'fulltext' sub-document. 
+ """ + sha1 = sha1.lower() + assert len(sha1) == 40 and sha1.isalnum() + hits = do_lookup_query( + f'fulltext.file_sha1:{sha1} fulltext.file_mimetype:"application/pdf"' + ) + if not hits.results: + return None + fulltext = ScholarFulltext.parse_obj(hits.results[0]["fulltext"]) + if not fulltext.access_type in ("ia_file", "wayback"): + return None + assert fulltext.file_sha1 == sha1 + assert fulltext.file_mimetype == "application/pdf" + assert fulltext.access_url + return fulltext diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html index 9524d7e..63c988d 100644 --- a/fatcat_scholar/templates/search_macros.html +++ b/fatcat_scholar/templates/search_macros.html @@ -435,7 +435,7 @@ {{ platform_access_button(paper.biblio) }} {# ### COLLAPSED HITS #} - {% if paper._collapsed_count > 0 %} + {% if paper._collapsed_count and paper._collapsed_count > 0 %} <button class="ui basic compact blue labeled icon button serp-button" form="search_form" type="submit" name="collapse_key" value="{{ paper.collapse_key }}"> <i class="ui icon zoom-in"></i> {% trans trimmed count=paper._collapsed_count %} diff --git a/fatcat_scholar/templates/work.html b/fatcat_scholar/templates/work.html new file mode 100644 index 0000000..92e334e --- /dev/null +++ b/fatcat_scholar/templates/work.html @@ -0,0 +1,48 @@ +{% import "search_macros.html" as search_macros %} +{% extends "base.html" %} + +{% block title %} +{{ doc.title }} +{% endblock %} + +{% block extra_head %} + <link rel="canonical" href="/work/{{ doc.work_ident }}"> + + <meta name="citation_title" content="{{ doc.biblio.title }}"> +{% for contrib in doc.biblio.contrib_names %} + <meta name="citation_author" content="{{ contrib }}"> +{% endfor %} +{% if doc.biblio.release_date or doc.biblio.release_year %} + <meta name="citation_publication_date" content="{{ doc.biblio.release_date or doc.biblio.release_year }}"> +{% endif %} +{% if doc.biblio.container_name %} + <meta 
name="citation_journal_title" content="{{ doc.biblio.container_name }}"> +{% endif %} + +{% if doc.biblio.volume %} + <meta name="citation_volume" content="{{ doc.biblio.volume }}"> +{% endif %} +{% if doc.biblio.issue %} + <meta name="citation_issue" content="{{ doc.biblio.issue }}"> +{% endif %} +{% if doc.biblio.pages %} + <meta name="citation_first_page" content="{{ doc.biblio.pages }}"> +{% endif %} +{% if doc.biblio.doi %} + <meta name="citation_doi" content="{{ doc.biblio.doi }}"> +{% endif %} +{% if doc.fulltext.access_url and doc.biblio.release_ident == doc.fulltext.release_ident and doc.fulltext.access_type in ['wayback', 'ia_file'] and doc.fulltext.file_mimetype == "application/pdf" and doc.fulltext.file_sha1 %} +<!-- PDF access redirect URL, as requested by, eg, scholar.google.com --> +<meta name="citation_pdf_url" content="/access-redirect/{{ doc.fulltext.file_sha1 }}.pdf"> +<!-- <meta name="citation_pdf_url" content="{{ doc.fulltext.access_url }}"> --> +{% endif %} + +{% endblock %} + +{% block fullmain %} +<div class="ui centered grid"> + <div class="ui fourteen wide column serp-column" style="margin-top: 2em;"> + {{ search_macros.fulltext_search_result_row(doc, locale=locale, debug_mode=False, expand=True) }} + </div> +</div> +{% endblock %} diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py index 895af18..56f2561 100644 --- a/fatcat_scholar/web.py +++ b/fatcat_scholar/web.py @@ -10,9 +10,14 @@ from typing import Optional, Any, List, Dict from pydantic import BaseModel import babel.numbers import babel.support -from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException +from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException, Query from fastapi.staticfiles import StaticFiles -from fastapi.responses import PlainTextResponse, JSONResponse, FileResponse +from fastapi.responses import ( + PlainTextResponse, + JSONResponse, + FileResponse, + RedirectResponse, +) from fastapi.middleware.cors import 
CORSMiddleware import sentry_sdk from sentry_sdk.integrations.asgi import SentryAsgiMiddleware @@ -26,6 +31,8 @@ from fatcat_scholar.search import ( FulltextQuery, FulltextHits, es_scholar_index_alive, + get_es_scholar_doc, + lookup_fulltext_pdf, ) from fatcat_scholar.schema import ScholarDoc @@ -160,6 +167,60 @@ def search(query: FulltextQuery = Depends(FulltextQuery)) -> FulltextHits: return hits +@api.get("/work/{work_ident}", operation_id="get_work") +def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict: + doc = get_es_scholar_doc(f"work_{work_ident}") + if not doc: + raise HTTPException(status_code=404, detail="work not found") + doc.pop("_obj", None) + return doc + + +def wayback_direct_url(url: str) -> str: + """ + Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access) + """ + if not "://web.archive.org" in url: + return url + segments = url.split("/") + if len(segments) < 6 or not segments[4].isdigit(): + return url + segments[4] += "id_" + return "/".join(segments) + + +def test_wayback_direct_url() -> None: + assert ( + wayback_direct_url("http://fatcat.wiki/thing.pdf") + == "http://fatcat.wiki/thing.pdf" + ) + assert ( + wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf") + == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf" + ) + assert ( + wayback_direct_url( + "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf" + ) + == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf" + ) + + +@api.get( + "/access-redirect/{sha1}.pdf", + operation_id="access_redirect_pdf", + include_in_schema=False, +) +def access_redirect_pdf(sha1: str = Query(..., min_length=40, max_length=40)) -> Any: + fulltext = lookup_fulltext_pdf(sha1) + if not fulltext or not fulltext.access_url: + raise HTTPException(status_code=404, detail="PDF file not found") + access_url = fulltext.access_url + if fulltext.access_type == "wayback": + access_url = 
wayback_direct_url(access_url) + return RedirectResponse(access_url, status_code=302) + + web = APIRouter() @@ -296,6 +357,33 @@ def web_search( ) +@web.get("/work/{work_ident}", include_in_schema=False) +def web_work( + request: Request, + response: Response, + work_ident: str = Query(..., min_length=20, max_length=20), + lang: LangPrefix = Depends(LangPrefix), + content: ContentNegotiation = Depends(ContentNegotiation), +) -> Any: + + if content.mimetype == "application/json": + return get_work(work_ident) + + doc = get_es_scholar_doc(f"work_{work_ident}") + if not doc: + raise HTTPException(status_code=404, detail="work not found") + + return i18n_templates[lang.code].TemplateResponse( + "work.html", + { + "request": request, + "locale": lang.code, + "lang_prefix": lang.prefix, + "doc": doc, + }, + ) + + app = FastAPI( title="Fatcat Scholar", description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.", |