aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/search.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-04-23 19:01:22 -0700
committerBryan Newbold <bnewbold@archive.org>2021-04-23 19:03:40 -0700
commite35e99bceff3277afaac8f2d5519aa4f07aabe49 (patch)
treea07ff995c7c64362c895dea3933060a832d04c38 /fatcat_scholar/search.py
parentd394c39c6a1b05033029c42377fbf40603b07bbd (diff)
downloadfatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.tar.gz
fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.zip
web: initial implementation of work landing page and citation_pdf_url access redirect
The initial intent is to have something that can be used by indexing services to pull the citation_pdf_url meta tag and bounce to a direct IA PDF access URL. For now the landing page stubs are just formatted as SERP results. Presumably these will get re-styled at some point and include citation graph links, etc.
Diffstat (limited to 'fatcat_scholar/search.py')
-rw-r--r--fatcat_scholar/search.py38
1 files changed, 37 insertions, 1 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 5571909..c5fca35 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -21,7 +21,7 @@ from pydantic import BaseModel
from fatcat_scholar.config import settings
from fatcat_scholar.identifiers import *
-from fatcat_scholar.schema import ScholarDoc
+from fatcat_scholar.schema import ScholarDoc, ScholarFulltext
from fatcat_scholar.query_parse import sniff_citation_query, pre_parse_query
from fatcat_scholar.query_citation import try_fuzzy_match
@@ -444,3 +444,39 @@ def es_scholar_index_alive() -> bool:
return bool(resp["_shards"]["successful"] == resp["_shards"]["total"])
except KeyError:
return False
+
+
def get_es_scholar_doc(key: str) -> Optional[dict]:
    """
    Fetch a single document from search index, by key. Returns None if not found.

    The returned dict is the `_source` of the fetched document. When that
    source parses as a ScholarDoc, the parsed object is additionally stored
    under the "_obj" key. Parsing is best-effort: on failure a warning is
    logged and the dict is returned without "_obj".
    """
    # Function-scope import so this block does not depend on a module-level
    # logging import.
    import logging

    try:
        # Keyword arguments, so the call does not depend on the positional
        # parameter order of the elasticsearch client's get().
        resp = es_client.get(
            index=settings.ELASTICSEARCH_QUERY_FULLTEXT_INDEX,
            id=key,
        )
    except elasticsearch.exceptions.NotFoundError:
        return None
    doc = resp["_source"]
    try:
        doc["_obj"] = ScholarDoc.parse_obj(doc)
    except Exception:
        # Deliberately broad: a document that fails to parse is still
        # returned to the caller, but the failure is no longer invisible.
        logging.getLogger(__name__).warning(
            "could not parse search index document as ScholarDoc: key=%s",
            key,
            exc_info=True,
        )
    return doc
+
+
def lookup_fulltext_pdf(sha1: str) -> Optional["ScholarFulltext"]:
    """
    Fetch a document by fulltext file sha1, returning only the 'fulltext' sub-document.

    The sha1 is lower-cased before use. Only the first search hit is
    considered. Returns None when the lookup has no results, or when the
    hit's fulltext access_type is neither "ia_file" nor "wayback".

    Raises AssertionError if the sha1 is not 40 alphanumeric characters, or
    if the fulltext sub-document of the hit does not have the requested sha1,
    the "application/pdf" mimetype, and a non-empty access_url.
    """
    sha1 = sha1.lower()
    # Explicit check rather than an `assert` statement: the value is
    # interpolated into the query string below, and `assert` statements are
    # removed when Python runs with -O. The exception type is unchanged.
    if len(sha1) != 40 or not sha1.isalnum():
        raise AssertionError("sha1 must be exactly 40 alphanumeric characters")
    hits = do_lookup_query(
        f'fulltext.file_sha1:{sha1} fulltext.file_mimetype:"application/pdf"'
    )
    if not hits.results:
        return None
    fulltext = ScholarFulltext.parse_obj(hits.results[0]["fulltext"])
    if fulltext.access_type not in ("ia_file", "wayback"):
        return None
    # Sanity checks on the returned sub-document; kept as AssertionError so
    # the failure mode matches the previous `assert` statements.
    if fulltext.file_sha1 != sha1:
        raise AssertionError("fulltext.file_sha1 does not match requested sha1")
    if fulltext.file_mimetype != "application/pdf":
        raise AssertionError("fulltext.file_mimetype is not application/pdf")
    if not fulltext.access_url:
        raise AssertionError("fulltext.access_url is missing")
    return fulltext