diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-01-19 00:55:44 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-01-19 19:49:04 -0800 |
commit | 89e9149b27263d6128be449c7b12b025cc031292 (patch) | |
tree | e3233ac5fc462881353be62c45217e9a4d974d4f | |
parent | 59cc64a24e4331899e6b952cc7a8dedc1ec13547 (diff) | |
download | fatcat-scholar-89e9149b27263d6128be449c7b12b025cc031292.tar.gz fatcat-scholar-89e9149b27263d6128be449c7b12b025cc031292.zip |
add citation query feature (disabled by default)
This is operationally complex (queries hit 3x backend services!), so not
enabled by default. Will need more testing, and possibly circuit-breaking.
Though haproxy should provide some of that automatically at this point.
-rw-r--r-- | fatcat_scholar/search.py | 83 | ||||
-rw-r--r-- | fatcat_scholar/web.py | 6 | ||||
-rw-r--r-- | settings.toml | 3 |
3 files changed, 75 insertions, 17 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py index 9bc5699..cbfc465 100644 --- a/fatcat_scholar/search.py +++ b/fatcat_scholar/search.py @@ -2,6 +2,7 @@ Helpers to make elasticsearch queries. """ +import copy import logging import datetime from gettext import gettext @@ -10,6 +11,7 @@ from typing import List, Optional, Any import elasticsearch from elasticsearch_dsl import Search, Q from elasticsearch_dsl.response import Response +import fatcat_openapi_client # pytype: disable=import-error from pydantic import BaseModel @@ -19,6 +21,8 @@ from pydantic import BaseModel from fatcat_scholar.config import settings from fatcat_scholar.identifiers import * from fatcat_scholar.schema import ScholarDoc +from fatcat_scholar.query_parse import sniff_citation_query, pre_parse_query +from fatcat_scholar.query_citation import try_fuzzy_match # i18n note: the use of gettext below doesn't actually do the translation here, # it just ensures that the strings are caught by babel for translation later @@ -81,6 +85,7 @@ class FulltextQuery(BaseModel): class FulltextHits(BaseModel): + query_type: str count_returned: int count_found: int offset: int @@ -206,26 +211,75 @@ def apply_filters(search: Search, query: FulltextQuery) -> Search: return search -def do_fulltext_search( - query: FulltextQuery, deep_page_limit: int = 2000 -) -> FulltextHits: +def process_query(query: FulltextQuery) -> FulltextHits: - search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX) + if not query.q: + return do_fulltext_search(query) - # Try handling raw identifier queries - if query.q and len(query.q.strip().split()) == 1 and not '"' in query.q: + # try handling raw identifier queries + if len(query.q.strip().split()) == 1 and not '"' in query.q: doi = clean_doi(query.q) if doi: - query.q = f'doi:"{doi}"' - query.filter_type = "everything" - query.filter_availability = "everything" - query.filter_time = "all_time" + return do_lookup_query(f'doi:"{doi}"') pmcid = 
clean_pmcid(query.q) if pmcid: - query.q = f'pmcid:"{pmcid}"' - query.filter_type = "everything" - query.filter_availability = "everything" - query.filter_time = "all_time" + return do_lookup_query(f'pmcid:"{pmcid}"') + + # if this is a citation string, do a fuzzy lookup + if settings.ENABLE_CITATION_QUERY and sniff_citation_query(query.q): + api_conf = fatcat_openapi_client.Configuration() + api_conf.host = settings.FATCAT_API_HOST + api_client = fatcat_openapi_client.DefaultApi( + fatcat_openapi_client.ApiClient(api_conf) + ) + fatcat_es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki") + key: Optional[str] = None + try: + key = try_fuzzy_match( + query.q, + grobid_host=settings.GROBID_HOST, + es_client=fatcat_es_client, + fatcat_api_client=api_client, + ) + except elasticsearch.exceptions.RequestError as e: + logging.warn(f"citation fuzzy failure: {e}") + pass + except Exception as e: + # TODO: sentry log? + logging.warn(f"citation fuzzy failure: {e}") + raise e + if key: + result = do_lookup_query(f"key:{key}") + if result: + result.query_type = "citation" + return result + + # fall through to regular query, with pre-parsing + query = copy.copy(query) + if query.q: + query.q = pre_parse_query(query.q) + + return do_fulltext_search(query) + + +def do_lookup_query(lookup: str) -> FulltextHits: + logging.info(f"lookup query: {lookup}") + query = FulltextQuery( + q=lookup, + filter_type="everything", + filter_availability="everything", + filter_time="all_time", + ) + result = do_fulltext_search(query) + result.query_type = "lookup" + return result + + +def do_fulltext_search( + query: FulltextQuery, deep_page_limit: int = 2000 +) -> FulltextHits: + + search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX) if query.collapse_key: search = search.filter("term", collapse_key=query.collapse_key) @@ -349,6 +403,7 @@ def do_fulltext_search( count_found = count_returned return FulltextHits( + query_type="fulltext", 
count_returned=count_returned, count_found=count_found, offset=offset, diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py index a54b4e4..133ddce 100644 --- a/fatcat_scholar/web.py +++ b/fatcat_scholar/web.py @@ -19,7 +19,7 @@ from starlette.exceptions import HTTPException as StarletteHTTPException from fatcat_scholar.config import settings, GIT_REVISION from fatcat_scholar.hacks import Jinja2Templates, parse_accept_lang -from fatcat_scholar.search import do_fulltext_search, FulltextQuery, FulltextHits +from fatcat_scholar.search import process_query, FulltextQuery, FulltextHits from fatcat_scholar.schema import ScholarDoc @@ -97,7 +97,7 @@ class HitsModel(BaseModel): async def search(query: FulltextQuery = Depends(FulltextQuery)) -> FulltextHits: hits: Optional[FulltextHits] = None try: - hits = do_fulltext_search(query) + hits = process_query(query) except ValueError as e: sentry_sdk.set_level("warning") sentry_sdk.capture_exception(e) @@ -213,7 +213,7 @@ async def web_search( status_code: int = 200 if query.q is not None: try: - hits = do_fulltext_search(query) + hits = process_query(query) except ValueError as e: sentry_sdk.set_level("warning") sentry_sdk.capture_exception(e) diff --git a/settings.toml b/settings.toml index 34f29ce..6cd8823 100644 --- a/settings.toml +++ b/settings.toml @@ -10,6 +10,7 @@ ELASTICSEARCH_FULLTEXT_INDEX = "scholar_fulltext" THUMBNAIL_URL_PREFIX = "https://blobs.fatcat.wiki/thumbnail/pdf/" SANDCRAWLER_DB_API = "http://aitio.us.archive.org:3030" SANDCRAWLER_S3_API = "wbgrp-svc169.us.archive.org:8333" +GROBID_HOST = "https://grobid.qa.fatcat.wiki" KAFKA_BROKERS = [] FATCAT_API_HOST = "https://api.fatcat.wiki/v0" INDEX_WORKER_BATCH_SIZE = 50 @@ -19,6 +20,7 @@ GOATCOUNTER_ENDPOINT = "https://goatcounter.scholar.fatcat.wiki/count" GOATCOUNTER_SCRIPT_URL = "https://goatcounter.scholar.fatcat.wiki/count.js" ONION_DOMAIN = "scholar-qa.archivev3qli37bju4rlh27glh24lljyezwxf4pokmrdbpefjlcrp5id.onion" ENABLE_PROMETHEUS = false 
+ENABLE_CITATION_QUERY = false [test] SCHOLAR_ENV = "test" @@ -28,6 +30,7 @@ ELASTICSEARCH_FRONTEND = "http://disabled-during-tests-bogus.xyz:3333" ELASTICSEARCH_BACKEND = "http://disabled-during-tests-bogus.xyz:3333" SANDCRAWLER_DB_API = "http://disabled-during-tests-bogus.xyz:3333" SANDCRAWLER_S3_API = "disabled-during-tests-bogus.xyz:8333" +GROBID_HOST = "http://disabled-during-tests-bogus.xyz:3333" SCHOLAR_ISSUEDB_PATH = ":memory:" [development] |