summaryrefslogtreecommitdiffstats
path: root/fatcat_scholar
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-04-23 19:01:22 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-04-23 19:03:40 -0700
commit e35e99bceff3277afaac8f2d5519aa4f07aabe49 (patch)
tree a07ff995c7c64362c895dea3933060a832d04c38 /fatcat_scholar
parent d394c39c6a1b05033029c42377fbf40603b07bbd (diff)
download fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.tar.gz
fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.zip
web: initial implementation of work landing page and citation_pdf_url access redirect
The initial intent is to have something that can be used by indexing services to pull the citation_pdf_url meta tag and bounce to a direct IA PDF access URL. For now the landing page stubs are just formatted as SERP results. Presumably these will get re-styled at some point and include citation graph links, etc.
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- fatcat_scholar/search.py 38
-rw-r--r-- fatcat_scholar/templates/search_macros.html 2
-rw-r--r-- fatcat_scholar/templates/work.html 48
-rw-r--r-- fatcat_scholar/web.py 92
4 files changed, 176 insertions, 4 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 5571909..c5fca35 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -21,7 +21,7 @@ from pydantic import BaseModel
from fatcat_scholar.config import settings
from fatcat_scholar.identifiers import *
-from fatcat_scholar.schema import ScholarDoc
+from fatcat_scholar.schema import ScholarDoc, ScholarFulltext
from fatcat_scholar.query_parse import sniff_citation_query, pre_parse_query
from fatcat_scholar.query_citation import try_fuzzy_match
@@ -444,3 +444,39 @@ def es_scholar_index_alive() -> bool:
return bool(resp["_shards"]["successful"] == resp["_shards"]["total"])
except KeyError:
return False
+
+
+def get_es_scholar_doc(key: str) -> Optional[dict]:
+ """
+ Fetch a single document from search index, by key. Returns None if not found.
+ """
+ try:
+ resp = es_client.get(settings.ELASTICSEARCH_QUERY_FULLTEXT_INDEX, key)
+ except elasticsearch.exceptions.NotFoundError:
+ return None
+ doc = resp["_source"]
+ try:
+ doc["_obj"] = ScholarDoc.parse_obj(doc)
+ except Exception:
+ pass
+ return doc
+
+
+def lookup_fulltext_pdf(sha1: str) -> Optional[dict]:
+ """
+ Fetch a document by fulltext file sha1, returning only the 'fulltext' sub-document.
+ """
+ sha1 = sha1.lower()
+ assert len(sha1) == 40 and sha1.isalnum()
+ hits = do_lookup_query(
+ f'fulltext.file_sha1:{sha1} fulltext.file_mimetype:"application/pdf"'
+ )
+ if not hits.results:
+ return None
+ fulltext = ScholarFulltext.parse_obj(hits.results[0]["fulltext"])
+ if not fulltext.access_type in ("ia_file", "wayback"):
+ return None
+ assert fulltext.file_sha1 == sha1
+ assert fulltext.file_mimetype == "application/pdf"
+ assert fulltext.access_url
+ return fulltext
diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html
index 9524d7e..63c988d 100644
--- a/fatcat_scholar/templates/search_macros.html
+++ b/fatcat_scholar/templates/search_macros.html
@@ -435,7 +435,7 @@
{{ platform_access_button(paper.biblio) }}
{# ### COLLAPSED HITS #}
- {% if paper._collapsed_count > 0 %}
+ {% if paper._collapsed_count and paper._collapsed_count > 0 %}
<button class="ui basic compact blue labeled icon button serp-button" form="search_form" type="submit" name="collapse_key" value="{{ paper.collapse_key }}">
<i class="ui icon zoom-in"></i>
{% trans trimmed count=paper._collapsed_count %}
diff --git a/fatcat_scholar/templates/work.html b/fatcat_scholar/templates/work.html
new file mode 100644
index 0000000..92e334e
--- /dev/null
+++ b/fatcat_scholar/templates/work.html
@@ -0,0 +1,48 @@
+{% import "search_macros.html" as search_macros %}
+{% extends "base.html" %}
+
+{% block title %}
+{{ doc.title }}
+{% endblock %}
+
+{% block extra_head %}
+ <link rel="canonical" href="/work/{{ doc.work_ident }}">
+
+ <meta name="citation_title" content="{{ doc.biblio.title }}">
+{% for contrib in doc.biblio.contrib_names %}
+ <meta name="citation_author" content="{{ contrib }}">
+{% endfor %}
+{% if doc.biblio.release_date or doc.biblio.release_year %}
+ <meta name="citation_publication_date" content="{{ doc.biblio.release_date or doc.biblio.release_year }}">
+{% endif %}
+{% if doc.biblio.container_name %}
+ <meta name="citation_journal_title" content="{{ doc.biblio.container_name }}">
+{% endif %}
+
+{% if doc.biblio.volume %}
+ <meta name="citation_volume" content="{{ doc.biblio.volume }}">
+{% endif %}
+{% if doc.biblio.issue %}
+ <meta name="citation_issue" content="{{ doc.biblio.issue }}">
+{% endif %}
+{% if doc.biblio.pages %}
+ <meta name="citation_first_page" content="{{ doc.biblio.pages }}">
+{% endif %}
+{% if doc.biblio.doi %}
+ <meta name="citation_doi" content="{{ doc.biblio.doi }}">
+{% endif %}
+{% if doc.fulltext.access_url and doc.biblio.release_ident == doc.fulltext.release_ident and doc.fulltext.access_type in ['wayback', 'ia_file'] and doc.fulltext.file_mimetype == "application/pdf" and doc.fulltext.file_sha1 %}
+<!-- PDF access redirect URL, as requested by, eg, scholar.google.com -->
+<meta name="citation_pdf_url" content="/access-redirect/{{ doc.fulltext.file_sha1 }}.pdf">
+<!-- <meta name="citation_pdf_url" content="{{ doc.fulltext.access_url }}"> -->
+{% endif %}
+
+{% endblock %}
+
+{% block fullmain %}
+<div class="ui centered grid">
+ <div class="ui fourteen wide column serp-column" style="margin-top: 2em;">
+ {{ search_macros.fulltext_search_result_row(doc, locale=locale, debug_mode=False, expand=True) }}
+ </div>
+</div>
+{% endblock %}
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index 895af18..56f2561 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -10,9 +10,14 @@ from typing import Optional, Any, List, Dict
from pydantic import BaseModel
import babel.numbers
import babel.support
-from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException
+from fastapi import FastAPI, APIRouter, Request, Depends, Response, HTTPException, Query
from fastapi.staticfiles import StaticFiles
-from fastapi.responses import PlainTextResponse, JSONResponse, FileResponse
+from fastapi.responses import (
+ PlainTextResponse,
+ JSONResponse,
+ FileResponse,
+ RedirectResponse,
+)
from fastapi.middleware.cors import CORSMiddleware
import sentry_sdk
from sentry_sdk.integrations.asgi import SentryAsgiMiddleware
@@ -26,6 +31,8 @@ from fatcat_scholar.search import (
FulltextQuery,
FulltextHits,
es_scholar_index_alive,
+ get_es_scholar_doc,
+ lookup_fulltext_pdf,
)
from fatcat_scholar.schema import ScholarDoc
@@ -160,6 +167,60 @@ def search(query: FulltextQuery = Depends(FulltextQuery)) -> FulltextHits:
return hits
+@api.get("/work/{work_ident}", operation_id="get_work")
+def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict:
+ doc = get_es_scholar_doc(f"work_{work_ident}")
+ if not doc:
+ raise HTTPException(status_code=404, detail="work not found")
+ doc.pop("_obj", None)
+ return doc
+
+
+def wayback_direct_url(url: str) -> str:
+ """
+ Re-writes a wayback replay URL to add the 'id_' suffix (or equivalent for direct file access)
+ """
+ if not "://web.archive.org" in url:
+ return url
+ segments = url.split("/")
+ if len(segments) < 6 or not segments[4].isdigit():
+ return url
+ segments[4] += "id_"
+ return "/".join(segments)
+
+
+def test_wayback_direct_url() -> None:
+ assert (
+ wayback_direct_url("http://fatcat.wiki/thing.pdf")
+ == "http://fatcat.wiki/thing.pdf"
+ )
+ assert (
+ wayback_direct_url("https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf")
+ == "https://web.archive.org/web/*/http://fatcat.wiki/thing.pdf"
+ )
+ assert (
+ wayback_direct_url(
+ "https://web.archive.org/web/1234/http://fatcat.wiki/thing.pdf"
+ )
+ == "https://web.archive.org/web/1234id_/http://fatcat.wiki/thing.pdf"
+ )
+
+
+@api.get(
+ "/access-redirect/{sha1}.pdf",
+ operation_id="access_redirect_pdf",
+ include_in_schema=False,
+)
+def access_redirect_pdf(sha1: str = Query(..., min_length=40, max_length=40)) -> Any:
+ fulltext = lookup_fulltext_pdf(sha1)
+ if not fulltext or not fulltext.access_url:
+ raise HTTPException(status_code=404, detail="PDF file not found")
+ access_url = fulltext.access_url
+ if fulltext.access_type == "wayback":
+ access_url = wayback_direct_url(access_url)
+ return RedirectResponse(access_url, status_code=302)
+
+
web = APIRouter()
@@ -296,6 +357,33 @@ def web_search(
)
+@web.get("/work/{work_ident}", include_in_schema=False)
+def web_work(
+ request: Request,
+ response: Response,
+ work_ident: str = Query(..., min_length=20, max_length=20),
+ lang: LangPrefix = Depends(LangPrefix),
+ content: ContentNegotiation = Depends(ContentNegotiation),
+) -> Any:
+
+ if content.mimetype == "application/json":
+ return get_work(work_ident)
+
+ doc = get_es_scholar_doc(f"work_{work_ident}")
+ if not doc:
+ raise HTTPException(status_code=404, detail="work not found")
+
+ return i18n_templates[lang.code].TemplateResponse(
+ "work.html",
+ {
+ "request": request,
+ "locale": lang.code,
+ "lang_prefix": lang.prefix,
+ "doc": doc,
+ },
+ )
+
+
app = FastAPI(
title="Fatcat Scholar",
description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.",