diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2021-04-23 19:01:22 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2021-04-23 19:03:40 -0700 | 
| commit | e35e99bceff3277afaac8f2d5519aa4f07aabe49 (patch) | |
| tree | a07ff995c7c64362c895dea3933060a832d04c38 | |
| parent | d394c39c6a1b05033029c42377fbf40603b07bbd (diff) | |
| download | fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.tar.gz fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.zip | |
web: initial implementation of work landing page and citation_pdf_url access redirect
The initial intent is to have something that can be used by indexing
services to pull the citation_pdf_url meta tag and bounce to a direct
IA PDF access URL.
For now the landing page stubs are just formatted as SERP results.
Presumably these will get re-styled at some point and include citation
graph links, etc.
| -rw-r--r-- | fatcat_scholar/search.py | 38 | ||||
| -rw-r--r-- | fatcat_scholar/templates/search_macros.html | 2 | ||||
| -rw-r--r-- | fatcat_scholar/templates/work.html | 48 | ||||
| -rw-r--r-- | fatcat_scholar/web.py | 92 | 
4 files changed, 176 insertions, 4 deletions
| diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py index 5571909..c5fca35 100644 --- a/fatcat_scholar/search.py +++ b/fatcat_scholar/search.py @@ -21,7 +21,7 @@ from pydantic import BaseModel  from fatcat_scholar.config import settings  from fatcat_scholar.identifiers import * -from fatcat_scholar.schema import ScholarDoc +from fatcat_scholar.schema import ScholarDoc, ScholarFulltext  from fatcat_scholar.query_parse import sniff_citation_query, pre_parse_query  from fatcat_scholar.query_citation import try_fuzzy_match @@ -444,3 +444,39 @@ def es_scholar_index_alive() -> bool:          return bool(resp["_shards"]["successful"] == resp["_shards"]["total"])      except KeyError:          return False + + +def get_es_scholar_doc(key: str) -> Optional[dict]: +    """ +    Fetch a single document from search index, by key. Returns None if not found. +    """ +    try: +        resp = es_client.get(settings.ELASTICSEARCH_QUERY_FULLTEXT_INDEX, key) +    except elasticsearch.exceptions.NotFoundError: +        return None +    doc = resp["_source"] +    try: +        doc["_obj"] = ScholarDoc.parse_obj(doc) +    except Exception: +        pass +    return doc + + +def lookup_fulltext_pdf(sha1: str) -> Optional[dict]: +    """ +    Fetch a document by fulltext file sha1, returning only the 'fulltext' sub-document. 
+    """ +    sha1 = sha1.lower() +    assert len(sha1) == 40 and sha1.isalnum() +    hits = do_lookup_query( +        f'fulltext.file_sha1:{sha1} fulltext.file_mimetype:"application/pdf"' +    ) +    if not hits.results: +        return None +    fulltext = ScholarFulltext.parse_obj(hits.results[0]["fulltext"]) +    if not fulltext.access_type in ("ia_file", "wayback"): +        return None +    assert fulltext.file_sha1 == sha1 +    assert fulltext.file_mimetype == "application/pdf" +    assert fulltext.access_url +    return fulltext diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html index 9524d7e..63c988d 100644 --- a/fatcat_scholar/templates/search_macros.html +++ b/fatcat_scholar/templates/search_macros.html @@ -435,7 +435,7 @@    {{ platform_access_button(paper.biblio) }}    {# ### COLLAPSED HITS  #} -  {% if paper._collapsed_count > 0 %} +  {% if paper._collapsed_count and paper._collapsed_count > 0 %}      <button class="ui basic compact blue labeled icon button serp-button" form="search_form" type="submit" name="collapse_key" value="{{ paper.collapse_key }}">        <i class="ui icon zoom-in"></i>        {% trans trimmed count=paper._collapsed_count %} diff --git a/fatcat_scholar/templates/work.html b/fatcat_scholar/templates/work.html new file mode 100644 index 0000000..92e334e --- /dev/null +++ b/fatcat_scholar/templates/work.html @@ -0,0 +1,48 @@ +{% import "search_macros.html" as search_macros %} +{% extends "base.html" %} + +{% block title %} +{{ doc.title }} +{% endblock %} + +{% block extra_head %} +  <link rel="canonical" href="/work/{{ doc.work_ident }}"> + +  <meta name="citation_title" content="{{ doc.biblio.title }}"> +{% for contrib in doc.biblio.contrib_names %} +  <meta name="citation_author" content="{{ contrib }}"> +{% endfor %} +{% if doc.biblio.release_date or doc.biblio.release_year %} +  <meta name="citation_publication_date" content="{{ doc.biblio.release_date or 
doc.biblio.release_year }}"> +{% endif %} +{% if doc.biblio.container_name %} +  <meta name="citation_journal_title" content="{{ doc.biblio.container_name }}"> +{% endif %} + +{% if doc.biblio.volume %} +  <meta name="citation_volume" content="{{ doc.biblio.volume }}"> +{% endif %} +{% if doc.biblio.issue %} +  <meta name="citation_issue" content="{{ doc.biblio.issue }}"> +{% endif %} +{% if doc.biblio.pages %} +  <meta name="citation_first_page" content="{{ doc.biblio.pages }}"> +{% endif %} +{% if doc.biblio.doi %} +  <meta name="citation_doi" content="{{ doc.biblio.doi }}"> +{% endif %} +{% if doc.fulltext.access_url and doc.biblio.release_ident == doc.fulltext.release_ident and doc.fulltext.access_type in ['wayback', 'ia_file'] and doc.fulltext.file_mimetype == "application/pdf" and doc.fulltext.file_sha1 %} +<!-- PDF access redirect URL, as requested by, eg, scholar.google.com --> +<meta name="citation_pdf_url" content="/access-redirect/{{ doc.fulltext.file_sha1 }}.pdf"> +<!-- <meta name="citation_pdf_url" content="{{ doc.fulltext.access_url }}"> --> +{% endif %} + +{% endblock %} + +{% block fullmain %} +<div class="ui centered grid"> +  <div class="ui fourteen wide column serp-column" style="margin-top: 2em;"> +    {{ search_macros.fulltext_search_result_row(doc, locale=locale, debug_mode=False, expand=True) }} +  </div> +</div> +{% endblock %} diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py index 895af18..56f2561 100644 --- a/fatcat_scholar/web.py +++ b/fatcat_scholar/web.py @@ -10,9 +10,14 @@ from typing import Optional, Any, List, Dict  from pydantic import BaseModel  import babel.numbers  import babel.support -from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException +from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException, Query  from fastapi.staticfiles import StaticFiles -from fastapi.responses import PlainTextResponse, JSONResponse, FileResponse +from fastapi.responses import ( +    
PlainTextResponse, +    JSONResponse, +    FileResponse, +    RedirectResponse, +)  from fastapi.middleware.cors import CORSMiddleware  import sentry_sdk  from sentry_sdk.integrations.asgi import SentryAsgiMiddleware @@ -26,6 +31,8 @@ from fatcat_scholar.search import (      FulltextQuery,      FulltextHits,      es_scholar_index_alive, +    get_es_scholar_doc, +    lookup_fulltext_pdf,  )  from fatcat_scholar.schema import ScholarDoc @@ -160,6 +167,60 @@ def search(query: FulltextQuery = Depends(FulltextQuery)) -> FulltextHits:      return hits +@api.get("/work/{work_ident}", operation_id="get_work") +def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict: +    doc = get_es_scholar_doc(f"work_{work_ident}") +    if not doc: +        raise HTTPException(status_code=404, detail="work not found") +    doc.pop("_obj", None) +    return doc + + +def wayback_direct_url(url: str) -> str: +    """ +    Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access) +    """ +    if not "://web.archive.org" in url: +        return url +    segments = url.split("/") +    if len(segments) < 6 or not segments[4].isdigit(): +        return url +    segments[4] += "id_" +    return "/".join(segments) + + +def test_wayback_direct_url() -> None: +    assert ( +        wayback_direct_url("http://fatcat.wiki/thing.pdf") +        == "http://fatcat.wiki/thing.pdf" +    ) +    assert ( +        wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf") +        == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf" +    ) +    assert ( +        wayback_direct_url( +            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf" +        ) +        == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf" +    ) + + +@api.get( +    "/access-redirect/{sha1}.pdf", +    operation_id="access_redirect_pdf", +    include_in_schema=False, +) +def access_redirect_pdf(sha1: str = 
Query(..., min_length=40, max_length=40)) -> Any: +    fulltext = lookup_fulltext_pdf(sha1) +    if not fulltext or not fulltext.access_url: +        raise HTTPException(status_code=404, detail="PDF file not found") +    access_url = fulltext.access_url +    if fulltext.access_type == "wayback": +        access_url = wayback_direct_url(access_url) +    return RedirectResponse(access_url, status_code=302) + +  web = APIRouter() @@ -296,6 +357,33 @@ def web_search(      ) +@web.get("/work/{work_ident}", include_in_schema=False) +def web_work( +    request: Request, +    response: Response, +    work_ident: str = Query(..., min_length=20, max_length=20), +    lang: LangPrefix = Depends(LangPrefix), +    content: ContentNegotiation = Depends(ContentNegotiation), +) -> Any: + +    if content.mimetype == "application/json": +        return get_work(work_ident) + +    doc = get_es_scholar_doc(f"work_{work_ident}") +    if not doc: +        raise HTTPException(status_code=404, detail="work not found") + +    return i18n_templates[lang.code].TemplateResponse( +        "work.html", +        { +            "request": request, +            "locale": lang.code, +            "lang_prefix": lang.prefix, +            "doc": doc, +        }, +    ) + +  app = FastAPI(      title="Fatcat Scholar",      description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.", | 
