diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-04-23 19:01:22 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-04-23 19:03:40 -0700 |
commit | e35e99bceff3277afaac8f2d5519aa4f07aabe49 (patch) | |
tree | a07ff995c7c64362c895dea3933060a832d04c38 /fatcat_scholar/search.py | |
parent | d394c39c6a1b05033029c42377fbf40603b07bbd (diff) | |
download | fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.tar.gz fatcat-scholar-e35e99bceff3277afaac8f2d5519aa4f07aabe49.zip |
web: initial implementation of work landing page and citation_pdf_url access redirect
The initial intent is to have something that can be used by indexing
services to pull the citation_pdf_url meta tag and bounce to a direct
IA PDF access URL.
For now the landing page stubs are just formatted as SERP results.
Presumably these will get re-styled at some point and include citation
graph links, etc.
Diffstat (limited to 'fatcat_scholar/search.py')
-rw-r--r-- | fatcat_scholar/search.py | 38 |
1 files changed, 37 insertions, 1 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 5571909..c5fca35 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -21,7 +21,7 @@ from pydantic import BaseModel
 
 from fatcat_scholar.config import settings
 from fatcat_scholar.identifiers import *
-from fatcat_scholar.schema import ScholarDoc
+from fatcat_scholar.schema import ScholarDoc, ScholarFulltext
 from fatcat_scholar.query_parse import sniff_citation_query, pre_parse_query
 from fatcat_scholar.query_citation import try_fuzzy_match
 
@@ -444,3 +444,39 @@ def es_scholar_index_alive() -> bool:
         return bool(resp["_shards"]["successful"] == resp["_shards"]["total"])
     except KeyError:
         return False
+
+
+def get_es_scholar_doc(key: str) -> Optional[dict]:
+    """
+    Fetch a single document from search index, by key. Returns None if not found.
+    """
+    try:
+        resp = es_client.get(settings.ELASTICSEARCH_QUERY_FULLTEXT_INDEX, key)
+    except elasticsearch.exceptions.NotFoundError:
+        return None
+    doc = resp["_source"]
+    try:
+        doc["_obj"] = ScholarDoc.parse_obj(doc)
+    except Exception:
+        pass
+    return doc
+
+
+def lookup_fulltext_pdf(sha1: str) -> Optional[dict]:
+    """
+    Fetch a document by fulltext file sha1, returning only the 'fulltext' sub-document.
+    """
+    sha1 = sha1.lower()
+    assert len(sha1) == 40 and sha1.isalnum()
+    hits = do_lookup_query(
+        f'fulltext.file_sha1:{sha1} fulltext.file_mimetype:"application/pdf"'
+    )
+    if not hits.results:
+        return None
+    fulltext = ScholarFulltext.parse_obj(hits.results[0]["fulltext"])
+    if not fulltext.access_type in ("ia_file", "wayback"):
+        return None
+    assert fulltext.file_sha1 == sha1
+    assert fulltext.file_mimetype == "application/pdf"
+    assert fulltext.access_url
+    return fulltext