diff options
-rw-r--r-- | fatcat_scholar/search.py | 122 |
1 files changed, 70 insertions, 52 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index c15ed93..ecbae07 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -9,6 +9,7 @@
 from typing import List, Optional, Any
 
 import elasticsearch
 from elasticsearch_dsl import Search, Q
+from elasticsearch_dsl.response import Response  # pytype: disable=import-error
 from pydantic import BaseModel
 
@@ -93,27 +94,49 @@ class FulltextHits(BaseModel):
 es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_BACKEND, timeout=25.0)
 
 
-def do_fulltext_search(
-    query: FulltextQuery, deep_page_limit: int = 2000
-) -> FulltextHits:
+def transform_es_results(resp: Response) -> List[dict]:
+    # convert from ES objects to python dicts
+    results = []
+    for h in resp:
+        r = h._d_
+        # print(h.meta._d_)
+        r["_highlights"] = []
+        if "highlight" in dir(h.meta):
+            highlights = h.meta.highlight._d_
+            for k in highlights:
+                r["_highlights"] += highlights[k]
+        r["_collapsed"] = []
+        r["_collapsed_count"] = 0
+        if "inner_hits" in dir(h.meta):
+            if isinstance(h.meta.inner_hits.more_pages.hits.total, int):
+                r["_collapsed_count"] = h.meta.inner_hits.more_pages.hits.total - 1
+            else:
+                r["_collapsed_count"] = (
+                    h.meta.inner_hits.more_pages.hits.total["value"] - 1
+                )
+            for k in h.meta.inner_hits.more_pages:
+                if k["key"] != r["key"]:
+                    r["_collapsed"].append(k)
+        results.append(r)
 
-    search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX)
+    for h in results:
+        # Handle surrogate strings that elasticsearch returns sometimes,
+        # probably due to mangled data processing in some pipeline.
+        # "Crimes against Unicode"; production workaround
+        for key in h:
+            if type(h[key]) is str:
+                h[key] = h[key].encode("utf8", "ignore").decode("utf8")
+        # ensure collapse_key is a single value, not an array
+        if type(h["collapse_key"]) == list:
+            h["collapse_key"] = h["collapse_key"][0]
+
+    return results
 
-    # Try handling raw identifier queries
-    if query.q and len(query.q.strip().split()) == 1 and not '"' in query.q:
-        doi = clean_doi(query.q)
-        if doi:
-            query.q = f'doi:"{doi}"'
-            query.filter_type = "everything"
-            query.filter_availability = "everything"
-            query.filter_time = "all_time"
-        pmcid = clean_pmcid(query.q)
-        if pmcid:
-            query.q = f'pmcid:"{pmcid}"'
-            query.filter_type = "everything"
-            query.filter_availability = "everything"
-            query.filter_time = "all_time"
+
+def apply_filters(search: Search, query: FulltextQuery) -> Search:
+    """
+    Applies query filters to ES Search object based on query
+    """
 
     # type filters
     if query.filter_type == "papers" or query.filter_type is None:
         search = search.filter(
@@ -175,6 +198,30 @@
         f"Unknown 'filter_availability' parameter value: '{query.filter_availability}'"
         )
 
+    return search
+
+
+def do_fulltext_search(
+    query: FulltextQuery, deep_page_limit: int = 2000
+) -> FulltextHits:
+
+    search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX)
+
+    # Try handling raw identifier queries
+    if query.q and len(query.q.strip().split()) == 1 and not '"' in query.q:
+        doi = clean_doi(query.q)
+        if doi:
+            query.q = f'doi:"{doi}"'
+            query.filter_type = "everything"
+            query.filter_availability = "everything"
+            query.filter_time = "all_time"
+        pmcid = clean_pmcid(query.q)
+        if pmcid:
+            query.q = f'pmcid:"{pmcid}"'
+            query.filter_type = "everything"
+            query.filter_availability = "everything"
+            query.filter_time = "all_time"
+
     if query.collapse_key:
         search = search.filter("term", collapse_key=query.collapse_key)
     else:
@@ -185,6 +232,9 @@
             }
         )
 
+    # apply filters from query
+    search = apply_filters(search, query)
+
     # we combined several queries to improve scoring.
 
     # this query use the fancy built-in query string parser
@@ -277,40 +327,8 @@
         raise IOError(str(e.info))
     query_delta = datetime.datetime.now() - query_start
 
-    # convert from objects to python dicts
-    results = []
-    for h in resp:
-        r = h._d_
-        # print(h.meta._d_)
-        r["_highlights"] = []
-        if "highlight" in dir(h.meta):
-            highlights = h.meta.highlight._d_
-            for k in highlights:
-                r["_highlights"] += highlights[k]
-        r["_collapsed"] = []
-        r["_collapsed_count"] = 0
-        if "inner_hits" in dir(h.meta):
-            if isinstance(h.meta.inner_hits.more_pages.hits.total, int):
-                r["_collapsed_count"] = h.meta.inner_hits.more_pages.hits.total - 1
-            else:
-                r["_collapsed_count"] = (
-                    h.meta.inner_hits.more_pages.hits.total["value"] - 1
-                )
-            for k in h.meta.inner_hits.more_pages:
-                if k["key"] != r["key"]:
-                    r["_collapsed"].append(k)
-        results.append(r)
-
-    for h in results:
-        # Handle surrogate strings that elasticsearch returns sometimes,
-        # probably due to mangled data processing in some pipeline.
-        # "Crimes against Unicode"; production workaround
-        for key in h:
-            if type(h[key]) is str:
-                h[key] = h[key].encode("utf8", "ignore").decode("utf8")
-        # ensure collapse_key is a single value, not an array
-        if type(h["collapse_key"]) == list:
-            h["collapse_key"] = h["collapse_key"][0]
+    # convert from API objects to dicts
+    results = transform_es_results(resp)
 
     count_found: int = 0
     if isinstance(resp.hits.total, int):