about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- fatcat_scholar/search.py 40
-rw-r--r-- schema/scholar_fulltext.v01.json 1
2 files changed, 36 insertions, 5 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 6842e65..4d53667 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -121,25 +121,55 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful
else:
raise ValueError(f"Unknown 'filter_time' parameter value: '{query.filter_time}'")
- search = search.query(
+ # we combine several queries to improve scoring.
+
+ # this query uses the fancy built-in query string parser
+ basic_fulltext = Q(
'query_string',
query=query.q,
default_operator="AND",
analyze_wildcard=True,
+ allow_leading_wildcard=False,
lenient=True,
fields=[
+ "title^5",
+ "biblio_all^3",
+ "abstracts_all^2",
"everything",
- "abstracts_all",
- "fulltext.body",
- "fulltext.annex",
],
)
+ has_fulltext = Q(
+ 'terms',
+ access_type=["ia_sim", "ia_file", "wayback"],
+ )
+ poor_metadata = Q(
+ 'bool',
+ should=[
+ # if these fields aren't set, metadata is poor. The more that do
+ # not exist, the stronger the signal.
+ Q("bool", must_not=Q("exists", field="title")),
+ Q("bool", must_not=Q("exists", field="year")),
+ Q("bool", must_not=Q("exists", field="type")),
+ Q("bool", must_not=Q("exists", field="stage")),
+ ],
+ )
+
+ search = search.query(
+ "boosting",
+ positive=Q(
+ "bool",
+ must=basic_fulltext,
+ should=[has_fulltext],
+ ),
+ negative=poor_metadata,
+ negative_boost=0.5,
+ )
search = search.highlight(
"abstracts_all",
"fulltext.body",
"fulltext.annex",
number_of_fragments=2,
- fragment_size=250,
+ fragment_size=300,
)
# sort order
diff --git a/schema/scholar_fulltext.v01.json b/schema/scholar_fulltext.v01.json
index 2f2eb58..48fcc51 100644
--- a/schema/scholar_fulltext.v01.json
+++ b/schema/scholar_fulltext.v01.json
@@ -199,6 +199,7 @@
"body": { "type": "alias", "path": "fulltext.body" },
"abstract": { "type": "alias", "path": "abstracts.body" },
"acknowledgement":{ "type": "alias", "path": "fulltext.acknowledgement" },
+ "access_type": { "type": "alias", "path": "fulltext.access_type" },
"doi": { "type": "alias", "path": "releases.doi" },
"doi_prefix": { "type": "alias", "path": "releases.doi_prefix" },