Diffstat (limited to 'fatcat_scholar/search.py')
-rw-r--r--	fatcat_scholar/search.py	70
1 file changed, 40 insertions(+), 30 deletions(-)
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index d29e03b..5a61f53 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -1,4 +1,3 @@
-
 """
 Helpers to make elasticsearch queries.
 """
@@ -17,6 +16,7 @@ from typing import List, Dict, Tuple, Optional, Any, Sequence
 # i18n note: the use of gettext below doesn't actually do the translation here,
 # it just ensures that the strings are caught by babel for translation later
 
+
 class FulltextQuery(BaseModel):
     q: Optional[str] = None
     limit: Optional[int] = None
@@ -76,31 +76,42 @@ class FulltextHits(BaseModel):
     offset: int
     limit: int
     deep_page_limit: int
-    query_time_ms: int 
+    query_time_ms: int
     results: List[Any]
 
 
-def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits:
+def do_fulltext_search(
+    query: FulltextQuery, deep_page_limit: int = 2000
+) -> FulltextHits:
     es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_BACKEND)
     search = Search(using=es_client, index=settings.ELASTICSEARCH_FULLTEXT_INDEX)
 
     # Convert raw DOIs to DOI queries
-    if query.q and len(query.q.split()) == 1 and query.q.startswith("10.") and query.q.count("/") >= 1:
+    if (
+        query.q
+        and len(query.q.split()) == 1
+        and query.q.startswith("10.")
+        and query.q.count("/") >= 1
+    ):
         search = search.filter("terms", doi=query.q)
         query.q = "*"
 
     # type filters
     if query.filter_type == "papers":
-        search = search.filter("terms", type=[ "article-journal", "paper-conference", "chapter", ])
+        search = search.filter(
+            "terms", type=["article-journal", "paper-conference", "chapter",]
+        )
     elif query.filter_type == "reports":
-        search = search.filter("terms", type=[ "report", "standard", ])
+        search = search.filter("terms", type=["report", "standard",])
     elif query.filter_type == "datasets":
-        search = search.filter("terms", type=[ "dataset", "software", ])
+        search = search.filter("terms", type=["dataset", "software",])
     elif query.filter_type == "everything" or query.filter_type == None:
         pass
     else:
-        raise ValueError(f"Unknown 'filter_type' parameter value: '{query.filter_type}'")
+        raise ValueError(
+            f"Unknown 'filter_type' parameter value: '{query.filter_type}'"
+        )
 
     # time filters
     if query.filter_time == "past_week":
@@ -111,7 +122,9 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits:
         # the later to catch papers which don't have release_date defined
         year_ago_date = str(datetime.date.today() - datetime.timedelta(days=365))
         this_year = datetime.date.today().year
-        search = search.filter(Q("range", date=dict(gte=year_ago_date)) | Q("term", year=this_year))
+        search = search.filter(
+            Q("range", date=dict(gte=year_ago_date)) | Q("term", year=this_year)
+        )
     elif query.filter_time == "since_2000":
         search = search.filter("range", year=dict(gte=2000))
     elif query.filter_time == "before_1925":
@@ -119,7 +132,9 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits:
     elif query.filter_time == "all_time" or query.filter_time == None:
         pass
     else:
-        raise ValueError(f"Unknown 'filter_time' parameter value: '{query.filter_time}'")
+        raise ValueError(
+            f"Unknown 'filter_time' parameter value: '{query.filter_time}'"
+        )
 
     # availability filters
     if query.filter_availability == "oa":
@@ -129,13 +144,15 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits:
     elif query.filter_availability == "fulltext" or query.filter_availability == None:
         search = search.filter("terms", access_type=["wayback", "ia_file", "ia_sim"])
     else:
-        raise ValueError(f"Unknown 'filter_availability' parameter value: '{query.filter_availability}'")
+        raise ValueError(
+            f"Unknown 'filter_availability' parameter value: '{query.filter_availability}'"
+        )
 
     # we combined several queries to improve scoring.
 
     # this query use the fancy built-in query string parser
     basic_fulltext = Q(
-        'query_string',
+        "query_string",
         query=query.q,
         default_operator="AND",
         analyze_wildcard=True,
@@ -150,12 +167,9 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits:
             "everything",
         ],
     )
-    has_fulltext = Q(
-        'terms',
-        access_type=["ia_sim", "ia_file", "wayback"],
-    )
+    has_fulltext = Q("terms", access_type=["ia_sim", "ia_file", "wayback"],)
     poor_metadata = Q(
-        'bool',
+        "bool",
         should=[
             # if these fields aren't set, metadata is poor. The more that do
             # not exist, the stronger the signal.
@@ -168,11 +182,7 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits:
 
     search = search.query(
         "boosting",
-        positive=Q(
-            "bool",
-            must=basic_fulltext,
-            should=[has_fulltext],
-        ),
+        positive=Q("bool", must=basic_fulltext, should=[has_fulltext],),
         negative=poor_metadata,
         negative_boost=0.5,
     )
@@ -201,15 +211,15 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits:
         # Avoid deep paging problem.
         offset = deep_page_limit
 
-    search = search[offset:offset+limit]
+    search = search[offset : offset + limit]
 
     try:
         resp = search.execute()
     except elasticsearch.exceptions.RequestError as e:
         # this is a "user" error
         print("elasticsearch 400: " + str(e.info), file=sys.stderr)
-        if e.info.get('error', {}).get('root_cause', {}):
-            raise ValueError(str(e.info['error']['root_cause'][0].get('reason')))
+        if e.info.get("error", {}).get("root_cause", {}):
+            raise ValueError(str(e.info["error"]["root_cause"][0].get("reason")))
         else:
             raise ValueError(str(e.info))
     except elasticsearch.exceptions.TransportError as e:
@@ -221,12 +231,12 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits:
     results = []
     for h in resp:
         r = h._d_
-        #print(json.dumps(h.meta._d_, indent=2))
-        r['_highlights'] = []
-        if 'highlight' in dir(h.meta):
+        # print(json.dumps(h.meta._d_, indent=2))
+        r["_highlights"] = []
+        if "highlight" in dir(h.meta):
             highlights = h.meta.highlight._d_
             for k in highlights:
-                r['_highlights'] += highlights[k]
+                r["_highlights"] += highlights[k]
         results.append(r)
 
     for h in results:
@@ -235,7 +245,7 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> FulltextHits:
         # "Crimes against Unicode"; production workaround
         for key in h:
             if type(h[key]) is str:
-                h[key] = h[key].encode('utf8', 'ignore').decode('utf8')
+                h[key] = h[key].encode("utf8", "ignore").decode("utf8")
 
     return FulltextHits(
         count_returned=len(results),
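Note (not part of the commit): the scoring logic touched by this reformat combines a "query_string" match with availability and metadata-quality signals via an elasticsearch "boosting" query. A minimal standalone sketch of that construction using elasticsearch_dsl, assuming a local backend URL, a placeholder index name, a placeholder user query, an abbreviated fields list, and illustrative poor_metadata clauses (the real list of fields differs):

import elasticsearch
from elasticsearch_dsl import Q, Search

# Sub-queries mirroring the diff: a query-string match, a fulltext-availability
# signal, and a "poor metadata" penalty query.
basic_fulltext = Q(
    "query_string",
    query="coffee roasting",  # placeholder user query
    default_operator="AND",
    analyze_wildcard=True,
    fields=["everything"],  # the diff elides the full field list; abbreviated here
)
has_fulltext = Q("terms", access_type=["ia_sim", "ia_file", "wayback"])
poor_metadata = Q(
    "bool",
    should=[
        # each missing field strengthens the "poor metadata" signal
        Q("bool", must_not=Q("exists", field="title")),
        Q("bool", must_not=Q("exists", field="year")),
    ],
)

es_client = elasticsearch.Elasticsearch("http://localhost:9200")  # placeholder URL
search = Search(using=es_client, index="scholar_fulltext")  # placeholder index

# "boosting" keeps all positive matches but multiplies the score of
# poor-metadata hits by negative_boost (0.5), rather than filtering them out
search = search.query(
    "boosting",
    positive=Q("bool", must=basic_fulltext, should=[has_fulltext]),
    negative=poor_metadata,
    negative_boost=0.5,
)
print(search.to_dict())  # inspect the generated query body without executing it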