web: initial implementation of work landing page and citation_pdf_url access redirect

The initial intent is to have something that can be used by indexing services to pull the citation_pdf_url meta tag and bounce to a direct IA PDF access URL. For now the landing page stubs are just formatted as SERP results. Presumably these will get re-styled at some point and include citation graph links, etc.
author: Bryan Newbold <bnewbold@archive.org> 2021-04-23 19:01:22 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2021-04-23 19:03:40 -0700
commit: e35e99bceff3277afaac8f2d5519aa4f07aabe49 (patch)
tree: a07ff995c7c64362c895dea3933060a832d04c38
parent: d394c39c6a1b05033029c42377fbf40603b07bbd (diff)
download: fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.tar.gz
fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.zip
4 files changed, 176 insertions, 4 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 5571909..c5fca35 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -21,7 +21,7 @@ from pydantic import BaseModel
 
 from fatcat_scholar.config import settings
 from fatcat_scholar.identifiers import *
-from fatcat_scholar.schema import ScholarDoc
+from fatcat_scholar.schema import ScholarDoc, ScholarFulltext
 from fatcat_scholar.query_parse import sniff_citation_query, pre_parse_query
 from fatcat_scholar.query_citation import try_fuzzy_match
 
@@ -444,3 +444,39 @@ def es_scholar_index_alive() -> bool:
         return bool(resp["_shards"]["successful"] == resp["_shards"]["total"])
     except KeyError:
         return False
+
+
+def get_es_scholar_doc(key: str) -> Optional[dict]:
+    """
+    Fetch a single document from search index, by key. Returns None if not found.
+    """
+    try:
+        resp = es_client.get(settings.ELASTICSEARCH_QUERY_FULLTEXT_INDEX, key)
+    except elasticsearch.exceptions.NotFoundError:
+        return None
+    doc = resp["_source"]
+    try:
+        doc["_obj"] = ScholarDoc.parse_obj(doc)
+    except Exception:
+        pass
+    return doc
+
+
+def lookup_fulltext_pdf(sha1: str) -> Optional[dict]:
+    """
+    Fetch a document by fulltext file sha1, returning only the 'fulltext' sub-document.
+    """
+    sha1 = sha1.lower()
+    assert len(sha1) == 40 and sha1.isalnum()
+    hits = do_lookup_query(
+        f'fulltext.file_sha1:{sha1} fulltext.file_mimetype:"application/pdf"'
+    )
+    if not hits.results:
+        return None
+    fulltext = ScholarFulltext.parse_obj(hits.results[0]["fulltext"])
+    if not fulltext.access_type in ("ia_file", "wayback"):
+        return None
+    assert fulltext.file_sha1 == sha1
+    assert fulltext.file_mimetype == "application/pdf"
+    assert fulltext.access_url
+    return fulltext
diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html
index 9524d7e..63c988d 100644
--- a/fatcat_scholar/templates/search_macros.html
+++ b/fatcat_scholar/templates/search_macros.html
@@ -435,7 +435,7 @@
   {{ platform_access_button(paper.biblio) }}
 
   {# ### COLLAPSED HITS  #}
-  {% if paper._collapsed_count > 0 %}
+  {% if paper._collapsed_count and paper._collapsed_count > 0 %}
     <button class="ui basic compact blue labeled icon button serp-button" form="search_form" type="submit" name="collapse_key" value="{{ paper.collapse_key }}">
       <i class="ui icon zoom-in"></i>
       {% trans trimmed count=paper._collapsed_count %}
diff --git a/fatcat_scholar/templates/work.html b/fatcat_scholar/templates/work.html
new file mode 100644
index 0000000..92e334e
--- /dev/null
+++ b/fatcat_scholar/templates/work.html
@@ -0,0 +1,48 @@
+{% import "search_macros.html" as search_macros %}
+{% extends "base.html" %}
+
+{% block title %}
+{{ doc.title }}
+{% endblock %}
+
+{% block extra_head %}
+  <link rel="canonical" href="/work/{{ doc.work_ident }}">
+
+  <meta name="citation_title" content="{{ doc.biblio.title }}">
+{% for contrib in doc.biblio.contrib_names %}
+  <meta name="citation_author" content="{{ contrib }}">
+{% endfor %}
+{% if doc.biblio.release_date or doc.biblio.release_year %}
+  <meta name="citation_publication_date" content="{{ doc.biblio.release_date or doc.biblio.release_year }}">
+{% endif %}
+{% if doc.biblio.container_name %}
+  <meta name="citation_journal_title" content="{{ doc.biblio.container_name }}">
+{% endif %}
+
+{% if doc.biblio.volume %}
+  <meta name="citation_volume" content="{{ doc.biblio.volume }}">
+{% endif %}
+{% if doc.biblio.issue %}
+  <meta name="citation_issue" content="{{ doc.biblio.issue }}">
+{% endif %}
+{% if doc.biblio.pages %}
+  <meta name="citation_first_page" content="{{ doc.biblio.pages }}">
+{% endif %}
+{% if doc.biblio.doi %}
+  <meta name="citation_doi" content="{{ doc.biblio.doi }}">
+{% endif %}
+{% if doc.fulltext.access_url and doc.biblio.release_ident == doc.fulltext.release_ident and doc.fulltext.access_type in ['wayback', 'ia_file'] and doc.fulltext.file_mimetype == "application/pdf" and doc.fulltext.file_sha1 %}
+<!-- PDF access redirect URL, as requested by, eg, scholar.google.com -->
+<meta name="citation_pdf_url" content="/access-redirect/{{ doc.fulltext.file_sha1 }}.pdf">
+<!-- <meta name="citation_pdf_url" content="{{ doc.fulltext.access_url }}"> -->
+{% endif %}
+
+{% endblock %}
+
+{% block fullmain %}
+<div class="ui centered grid">
+  <div class="ui fourteen wide column serp-column" style="margin-top: 2em;">
+    {{ search_macros.fulltext_search_result_row(doc, locale=locale, debug_mode=False, expand=True) }}
+  </div>
+</div>
+{% endblock %}
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index 895af18..56f2561 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -10,9 +10,14 @@ from typing import Optional, Any, List, Dict
 from pydantic import BaseModel
 import babel.numbers
 import babel.support
-from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException
+from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException, Query
 from fastapi.staticfiles import StaticFiles
-from fastapi.responses import PlainTextResponse, JSONResponse, FileResponse
+from fastapi.responses import (
+    PlainTextResponse,
+    JSONResponse,
+    FileResponse,
+    RedirectResponse,
+)
 from fastapi.middleware.cors import CORSMiddleware
 import sentry_sdk
 from sentry_sdk.integrations.asgi import SentryAsgiMiddleware
@@ -26,6 +31,8 @@ from fatcat_scholar.search import (
     FulltextQuery,
     FulltextHits,
     es_scholar_index_alive,
+    get_es_scholar_doc,
+    lookup_fulltext_pdf,
 )
 from fatcat_scholar.schema import ScholarDoc
 
@@ -160,6 +167,60 @@ def search(query: FulltextQuery = Depends(FulltextQuery)) -> FulltextHits:
     return hits
 
 
+@api.get("/work/{work_ident}", operation_id="get_work")
+def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict:
+    doc = get_es_scholar_doc(f"work_{work_ident}")
+    if not doc:
+        raise HTTPException(status_code=404, detail="work not found")
+    doc.pop("_obj", None)
+    return doc
+
+
+def wayback_direct_url(url: str) -> str:
+    """
+    Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access)
+    """
+    if not "://web.archive.org" in url:
+        return url
+    segments = url.split("/")
+    if len(segments) < 6 or not segments[4].isdigit():
+        return url
+    segments[4] += "id_"
+    return "/".join(segments)
+
+
+def test_wayback_direct_url() -> None:
+    assert (
+        wayback_direct_url("http://fatcat.wiki/thing.pdf")
+        == "http://fatcat.wiki/thing.pdf"
+    )
+    assert (
+        wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf")
+        == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf"
+    )
+    assert (
+        wayback_direct_url(
+            "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
+        )
+        == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf"
+    )
+
+
+@api.get(
+    "/access-redirect/{sha1}.pdf",
+    operation_id="access_redirect_pdf",
+    include_in_schema=False,
+)
+def access_redirect_pdf(sha1: str = Query(..., min_length=40, max_length=40)) -> Any:
+    fulltext = lookup_fulltext_pdf(sha1)
+    if not fulltext or not fulltext.access_url:
+        raise HTTPException(status_code=404, detail="PDF file not found")
+    access_url = fulltext.access_url
+    if fulltext.access_type == "wayback":
+        access_url = wayback_direct_url(access_url)
+    return RedirectResponse(access_url, status_code=302)
+
+
 web = APIRouter()
 
 
@@ -296,6 +357,33 @@ def web_search(
     )
 
 
+@web.get("/work/{work_ident}", include_in_schema=False)
+def web_work(
+    request: Request,
+    response: Response,
+    work_ident: str = Query(..., min_length=20, max_length=20),
+    lang: LangPrefix = Depends(LangPrefix),
+    content: ContentNegotiation = Depends(ContentNegotiation),
+) -> Any:
+
+    if content.mimetype == "application/json":
+        return get_work(work_ident)
+
+    doc = get_es_scholar_doc(f"work_{work_ident}")
+    if not doc:
+        raise HTTPException(status_code=404, detail="work not found")
+
+    return i18n_templates[lang.code].TemplateResponse(
+        "work.html",
+        {
+            "request": request,
+            "locale": lang.code,
+            "lang_prefix": lang.prefix,
+            "doc": doc,
+        },
+    )
+
+
 app = FastAPI(
     title="Fatcat Scholar",
     description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.",
author	Bryan Newbold <bnewbold@archive.org>	2021-04-23 19:01:22 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2021-04-23 19:03:40 -0700
commit	e35e99bceff3277afaac8f2d5519aa4f07aabe49 (patch)
tree	a07ff995c7c64362c895dea3933060a832d04c38
parent	d394c39c6a1b05033029c42377fbf40603b07bbd (diff)
download	fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.tar.gz fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.zip