diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 18:56:14 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 18:56:14 -0700 |
commit | e7dbb5c3eef5a861c411c3bd058e590d04be557f (patch) | |
tree | f6b8594a124f9d0385decf25481a127fd62f19ac /fatcat_scholar/search.py | |
parent | 3ba3839ecd7924dc2f25295754d7a257c2542b23 (diff) | |
download | fatcat-scholar-e7dbb5c3eef5a861c411c3bd058e590d04be557f.tar.gz fatcat-scholar-e7dbb5c3eef5a861c411c3bd058e590d04be557f.zip |
first pass improving search scoring
Diffstat (limited to 'fatcat_scholar/search.py')
-rw-r--r-- | fatcat_scholar/search.py | 40 |
1 files changed, 35 insertions, 5 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py index 6842e65..4d53667 100644 --- a/fatcat_scholar/search.py +++ b/fatcat_scholar/search.py @@ -121,25 +121,55 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful else: raise ValueError(f"Unknown 'filter_time' parameter value: '{query.filter_time}'") - search = search.query( + # we combine several queries to improve scoring. + + # this query uses the fancy built-in query string parser + basic_fulltext = Q( 'query_string', query=query.q, default_operator="AND", analyze_wildcard=True, + allow_leading_wildcard=False, lenient=True, fields=[ + "title^5", + "biblio_all^3", + "abstracts_all^2", "everything", - "abstracts_all", - "fulltext.body", - "fulltext.annex", ], ) + has_fulltext = Q( + 'terms', + access_type=["ia_sim", "ia_file", "wayback"], + ) + poor_metadata = Q( + 'bool', + should=[ + # if these fields aren't set, metadata is poor. The more that do + # not exist, the stronger the signal. + Q("bool", must_not=Q("exists", field="title")), + Q("bool", must_not=Q("exists", field="year")), + Q("bool", must_not=Q("exists", field="type")), + Q("bool", must_not=Q("exists", field="stage")), + ], + ) + + search = search.query( + "boosting", + positive=Q( + "bool", + must=basic_fulltext, + should=[has_fulltext], + ), + negative=poor_metadata, + negative_boost=0.5, + ) search = search.highlight( "abstracts_all", "fulltext.body", "fulltext.annex", number_of_fragments=2, - fragment_size=250, + fragment_size=300, ) # sort order |