diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 18:56:14 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 18:56:14 -0700 |
commit | e7dbb5c3eef5a861c411c3bd058e590d04be557f (patch) | |
tree | f6b8594a124f9d0385decf25481a127fd62f19ac | |
parent | 3ba3839ecd7924dc2f25295754d7a257c2542b23 (diff) | |
download | fatcat-scholar-e7dbb5c3eef5a861c411c3bd058e590d04be557f.tar.gz fatcat-scholar-e7dbb5c3eef5a861c411c3bd058e590d04be557f.zip |
first pass improving search scoring
-rw-r--r-- | fatcat_scholar/search.py | 40 | ||||
-rw-r--r-- | schema/scholar_fulltext.v01.json | 1 |
2 files changed, 36 insertions, 5 deletions
```diff
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 6842e65..4d53667 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -121,25 +121,55 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful
     else:
         raise ValueError(f"Unknown 'filter_time' parameter value: '{query.filter_time}'")
 
-    search = search.query(
+    # we combined several queries to improve scoring.
+
+    # this query use the fancy built-in query string parser
+    basic_fulltext = Q(
         'query_string',
         query=query.q,
         default_operator="AND",
         analyze_wildcard=True,
+        allow_leading_wildcard=False,
         lenient=True,
         fields=[
+            "title^5",
+            "biblio_all^3",
+            "abstracts_all^2",
             "everything",
-            "abstracts_all",
-            "fulltext.body",
-            "fulltext.annex",
         ],
     )
+    has_fulltext = Q(
+        'terms',
+        access_type=["ia_sim", "ia_file", "wayback"],
+    )
+    poor_metadata = Q(
+        'bool',
+        should=[
+            # if these fields aren't set, metadata is poor. The more that do
+            # not exist, the stronger the signal.
+            Q("bool", must_not=Q("exists", field="title")),
+            Q("bool", must_not=Q("exists", field="year")),
+            Q("bool", must_not=Q("exists", field="type")),
+            Q("bool", must_not=Q("exists", field="stage")),
+        ],
+    )
+
+    search = search.query(
+        "boosting",
+        positive=Q(
+            "bool",
+            must=basic_fulltext,
+            should=[has_fulltext],
+        ),
+        negative=poor_metadata,
+        negative_boost=0.5,
+    )
     search = search.highlight(
         "abstracts_all",
         "fulltext.body",
         "fulltext.annex",
         number_of_fragments=2,
-        fragment_size=250,
+        fragment_size=300,
     )
 
     # sort order
diff --git a/schema/scholar_fulltext.v01.json b/schema/scholar_fulltext.v01.json
index 2f2eb58..48fcc51 100644
--- a/schema/scholar_fulltext.v01.json
+++ b/schema/scholar_fulltext.v01.json
@@ -199,6 +199,7 @@
         "body": { "type": "alias", "path": "fulltext.body" },
         "abstract": { "type": "alias", "path": "abstracts.body" },
         "acknowledgement":{ "type": "alias", "path": "fulltext.acknowledgement" },
+        "access_type": { "type": "alias", "path": "fulltext.access_type" },
         "doi": { "type": "alias", "path": "releases.doi" },
         "doi_prefix": { "type": "alias", "path": "releases.doi_prefix" },
```