summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-05-21 19:49:37 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-05-21 19:49:37 -0700
commit b0e77aebf00ec93d0a8bc5e948cad014c32e74d1 (patch)
tree afade65a2d8a53ece2a942c837100bb34435c7f8
parent e7dbb5c3eef5a861c411c3bd058e590d04be557f (diff)
download fatcat-scholar-b0e77aebf00ec93d0a8bc5e948cad014c32e74d1.tar.gz
fatcat-scholar-b0e77aebf00ec93d0a8bc5e948cad014c32e74d1.zip
fix abstracts; experiment with search stemming
-rw-r--r-- fatcat_scholar/search.py 6
-rw-r--r-- schema/README.md 8
-rw-r--r-- schema/scholar_fulltext.v01.json 30
3 files changed, 36 insertions, 8 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 4d53667..aeb089d 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -131,10 +131,12 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful
analyze_wildcard=True,
allow_leading_wildcard=False,
lenient=True,
+ quote_field_suffix=".exact",
fields=[
"title^5",
"biblio_all^3",
- "abstracts_all^2",
+ "abstracts.body^2",
+ "fulltext.body",
"everything",
],
)
@@ -165,7 +167,7 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful
negative_boost=0.5,
)
search = search.highlight(
- "abstracts_all",
+ "abstracts.body",
"fulltext.body",
"fulltext.annex",
number_of_fragments=2,
diff --git a/schema/README.md b/schema/README.md
new file mode 100644
index 0000000..3dec14d
--- /dev/null
+++ b/schema/README.md
@@ -0,0 +1,8 @@
+
+fatcat-scholar Elasticsearch Schema Notes
+===========================================
+
+
+### Stemming
+
+https://www.elastic.co/guide/en/elasticsearch/reference/master/mixing-exact-search-with-stemming.html
diff --git a/schema/scholar_fulltext.v01.json b/schema/scholar_fulltext.v01.json
index 48fcc51..23cadf2 100644
--- a/schema/scholar_fulltext.v01.json
+++ b/schema/scholar_fulltext.v01.json
@@ -12,12 +12,24 @@
"type": "custom",
"tokenizer": "icu_tokenizer",
"char_filter": [ "icu_normalizer" ],
- "filter": [ "icu_folding" ]
+ "filter": [ "icu_folding", "stemmer" ]
},
"textIcuSearch": {
"type": "custom",
"tokenizer": "icu_tokenizer",
"char_filter": [ "icu_normalizer" ],
+ "filter": [ "icu_folding", "stemmer" ]
+ },
+ "textIcuExact": {
+ "type": "custom",
+ "tokenizer": "icu_tokenizer",
+ "char_filter": [ "icu_normalizer" ],
+ "filter": [ "icu_folding" ]
+ },
+ "textIcuSearchExact": {
+ "type": "custom",
+ "tokenizer": "icu_tokenizer",
+ "char_filter": [ "icu_normalizer" ],
"filter": [ "icu_folding" ]
}
},
@@ -61,7 +73,10 @@
"dynamic": false,
"properties": {
"release_ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
+ "title": {
+ "type": "text","index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"],
+ "fields": {"exact": { "type": "text", "analyzer": "textIcuExact", "search_analyzer": "textIcuSearchExact" }}
+ },
"subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
"original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
"release_date": { "type": "date" },
@@ -91,7 +106,7 @@
"mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
"s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
"license_slug": { "type": "keyword", "normalizer": "default" },
- "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "publisher": { "type": "text", "index": true, "analyzer": "textIcuExact", "search_analyzer":"textIcuSearchExact" },
"publisher_type": { "type": "keyword", "normalizer": "default" },
"container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
"container_original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
@@ -101,8 +116,8 @@
"issns": { "type": "keyword", "normalizer": "default" },
"container_type": { "type": "keyword", "normalizer": "default" },
"contrib_count": { "type": "integer" },
- "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
- "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }
+ "contrib_names": { "type": "text", "index": true, "analyzer": "textIcuExact", "search_analyzer":"textIcuSearchExact", "copy_to": ["biblio_all", "everything"] },
+ "affiliations": { "type": "text", "index": true, "analyzer": "textIcuExact", "search_analyzer":"textIcuSearchExact" }
}
},
@@ -192,7 +207,10 @@
},
"everything": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "biblio_all": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "biblio_all": {
+ "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch",
+ "fields": {"exact": { "type": "text", "analyzer": "textIcuExact", "search_analyzer": "textIcuSearchExact" }}
+ },
"doctype": { "type": "alias", "path": "doc_type" },