summary refs log tree commit diff stats
path: root/fatcat_scholar/search.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-05-21 18:56:14 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-05-21 18:56:14 -0700
commit e7dbb5c3eef5a861c411c3bd058e590d04be557f (patch)
tree f6b8594a124f9d0385decf25481a127fd62f19ac /fatcat_scholar/search.py
parent 3ba3839ecd7924dc2f25295754d7a257c2542b23 (diff)
download fatcat-scholar-e7dbb5c3eef5a861c411c3bd058e590d04be557f.tar.gz
fatcat-scholar-e7dbb5c3eef5a861c411c3bd058e590d04be557f.zip
first pass improving search scoring
Diffstat (limited to 'fatcat_scholar/search.py')
-rw-r--r-- fatcat_scholar/search.py 40
1 file changed, 35 insertions, 5 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 6842e65..4d53667 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -121,25 +121,55 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful
else:
raise ValueError(f"Unknown 'filter_time' parameter value: '{query.filter_time}'")
- search = search.query(
+ # we combine several queries to improve scoring.
+
+ # this query uses the fancy built-in query string parser
+ basic_fulltext = Q(
'query_string',
query=query.q,
default_operator="AND",
analyze_wildcard=True,
+ allow_leading_wildcard=False,
lenient=True,
fields=[
+ "title^5",
+ "biblio_all^3",
+ "abstracts_all^2",
"everything",
- "abstracts_all",
- "fulltext.body",
- "fulltext.annex",
],
)
+ has_fulltext = Q(
+ 'terms',
+ access_type=["ia_sim", "ia_file", "wayback"],
+ )
+ poor_metadata = Q(
+ 'bool',
+ should=[
+ # if these fields aren't set, metadata is poor. The more that do
+ # not exist, the stronger the signal.
+ Q("bool", must_not=Q("exists", field="title")),
+ Q("bool", must_not=Q("exists", field="year")),
+ Q("bool", must_not=Q("exists", field="type")),
+ Q("bool", must_not=Q("exists", field="stage")),
+ ],
+ )
+
+ search = search.query(
+ "boosting",
+ positive=Q(
+ "bool",
+ must=basic_fulltext,
+ should=[has_fulltext],
+ ),
+ negative=poor_metadata,
+ negative_boost=0.5,
+ )
search = search.highlight(
"abstracts_all",
"fulltext.body",
"fulltext.annex",
number_of_fragments=2,
- fragment_size=250,
+ fragment_size=300,
)
# sort order