diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 19:49:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 19:49:37 -0700 |
commit | b0e77aebf00ec93d0a8bc5e948cad014c32e74d1 (patch) | |
tree | afade65a2d8a53ece2a942c837100bb34435c7f8 | |
parent | e7dbb5c3eef5a861c411c3bd058e590d04be557f (diff) | |
download | fatcat-scholar-b0e77aebf00ec93d0a8bc5e948cad014c32e74d1.tar.gz fatcat-scholar-b0e77aebf00ec93d0a8bc5e948cad014c32e74d1.zip |
fix abstracts; experiment with search stemming
-rw-r--r-- | fatcat_scholar/search.py | 6 | ||||
-rw-r--r-- | schema/README.md | 8 | ||||
-rw-r--r-- | schema/scholar_fulltext.v01.json | 30 |
3 files changed, 36 insertions, 8 deletions
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py index 4d53667..aeb089d 100644 --- a/fatcat_scholar/search.py +++ b/fatcat_scholar/search.py @@ -131,10 +131,12 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful analyze_wildcard=True, allow_leading_wildcard=False, lenient=True, + quote_field_suffix=".exact", fields=[ "title^5", "biblio_all^3", - "abstracts_all^2", + "abstracts.body^2", + "fulltext.body", "everything", ], ) @@ -165,7 +167,7 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful negative_boost=0.5, ) search = search.highlight( - "abstracts_all", + "abstracts.body", "fulltext.body", "fulltext.annex", number_of_fragments=2, diff --git a/schema/README.md b/schema/README.md new file mode 100644 index 0000000..3dec14d --- /dev/null +++ b/schema/README.md @@ -0,0 +1,8 @@ + +fatcat-scholar Elasticsearch Schema Notes +=========================================== + + +### Stemming + +https://www.elastic.co/guide/en/elasticsearch/reference/master/mixing-exact-search-with-stemming.html diff --git a/schema/scholar_fulltext.v01.json b/schema/scholar_fulltext.v01.json index 48fcc51..23cadf2 100644 --- a/schema/scholar_fulltext.v01.json +++ b/schema/scholar_fulltext.v01.json @@ -12,12 +12,24 @@ "type": "custom", "tokenizer": "icu_tokenizer", "char_filter": [ "icu_normalizer" ], - "filter": [ "icu_folding" ] + "filter": [ "icu_folding", "stemmer" ] }, "textIcuSearch": { "type": "custom", "tokenizer": "icu_tokenizer", "char_filter": [ "icu_normalizer" ], + "filter": [ "icu_folding", "stemmer" ] + }, + "textIcuExact": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ "icu_normalizer" ], + "filter": [ "icu_folding" ] + }, + "textIcuSearchExact": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ "icu_normalizer" ], "filter": [ "icu_folding" ] } }, @@ -61,7 +73,10 @@ "dynamic": false, "properties": { "release_ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "title": { + "type": "text","index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"], + "fields": {"exact": { "type": "text", "analyzer": "textIcuExact", "search_analyzer": "textIcuSearchExact" }} + }, "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, "release_date": { "type": "date" }, @@ -91,7 +106,7 @@ "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "license_slug": { "type": "keyword", "normalizer": "default" }, - "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "publisher": { "type": "text", "index": true, "analyzer": "textIcuExact", "search_analyzer":"textIcuSearchExact" }, "publisher_type": { "type": "keyword", "normalizer": "default" }, "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, "container_original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, @@ -101,8 +116,8 @@ "issns": { "type": "keyword", "normalizer": "default" }, "container_type": { "type": "keyword", "normalizer": "default" }, "contrib_count": { "type": "integer" }, - "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, - "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" } + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcuExact", "search_analyzer":"textIcuSearchExact", "copy_to": ["biblio_all", "everything"] }, + "affiliations": { "type": "text", "index": true, "analyzer": "textIcuExact", "search_analyzer":"textIcuSearchExact" } } }, @@ -192,7 +207,10 @@ }, "everything": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "biblio_all": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "biblio_all": { + "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", + "fields": {"exact": { "type": "text", "analyzer": "textIcuExact", "search_analyzer": "textIcuSearchExact" }} + }, "doctype": { "type": "alias", "path": "doc_type" }, |