diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 19:49:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-05-21 19:49:37 -0700 |
commit | b0e77aebf00ec93d0a8bc5e948cad014c32e74d1 (patch) | |
tree | afade65a2d8a53ece2a942c837100bb34435c7f8 /schema | |
parent | e7dbb5c3eef5a861c411c3bd058e590d04be557f (diff) | |
download | fatcat-scholar-b0e77aebf00ec93d0a8bc5e948cad014c32e74d1.tar.gz fatcat-scholar-b0e77aebf00ec93d0a8bc5e948cad014c32e74d1.zip |
fix abstracts; experiment with search stemming
Diffstat (limited to 'schema')
-rw-r--r-- | schema/README.md | 8 | ||||
-rw-r--r-- | schema/scholar_fulltext.v01.json | 30 |
2 files changed, 32 insertions, 6 deletions
```diff
diff --git a/schema/README.md b/schema/README.md
new file mode 100644
index 0000000..3dec14d
--- /dev/null
+++ b/schema/README.md
@@ -0,0 +1,8 @@
+
+fatcat-scholar Elasticsearch Schema Notes
+===========================================
+
+
+### Stemming
+
+https://www.elastic.co/guide/en/elasticsearch/reference/master/mixing-exact-search-with-stemming.html
diff --git a/schema/scholar_fulltext.v01.json b/schema/scholar_fulltext.v01.json
index 48fcc51..23cadf2 100644
--- a/schema/scholar_fulltext.v01.json
+++ b/schema/scholar_fulltext.v01.json
@@ -12,12 +12,24 @@
 "type": "custom",
 "tokenizer": "icu_tokenizer",
 "char_filter": [ "icu_normalizer" ],
- "filter": [ "icu_folding" ]
+ "filter": [ "icu_folding", "stemmer" ]
 },
 "textIcuSearch": {
 "type": "custom",
 "tokenizer": "icu_tokenizer",
 "char_filter": [ "icu_normalizer" ],
+ "filter": [ "icu_folding", "stemmer" ]
+ },
+ "textIcuExact": {
+ "type": "custom",
+ "tokenizer": "icu_tokenizer",
+ "char_filter": [ "icu_normalizer" ],
+ "filter": [ "icu_folding" ]
+ },
+ "textIcuSearchExact": {
+ "type": "custom",
+ "tokenizer": "icu_tokenizer",
+ "char_filter": [ "icu_normalizer" ],
 "filter": [ "icu_folding" ]
 }
 },
@@ -61,7 +73,10 @@
 "dynamic": false,
 "properties": {
 "release_ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
+ "title": {
+ "type": "text","index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"],
+ "fields": {"exact": { "type": "text", "analyzer": "textIcuExact", "search_analyzer": "textIcuSearchExact" }}
+ },
 "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
 "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
 "release_date": { "type": "date" },
@@ -91,7 +106,7 @@
 "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
 "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
 "license_slug": { "type": "keyword", "normalizer": "default" },
- "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "publisher": { "type": "text", "index": true, "analyzer": "textIcuExact", "search_analyzer":"textIcuSearchExact" },
 "publisher_type": { "type": "keyword", "normalizer": "default" },
 "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
 "container_original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
@@ -101,8 +116,8 @@
 "issns": { "type": "keyword", "normalizer": "default" },
 "container_type": { "type": "keyword", "normalizer": "default" },
 "contrib_count": { "type": "integer" },
- "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
- "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }
+ "contrib_names": { "type": "text", "index": true, "analyzer": "textIcuExact", "search_analyzer":"textIcuSearchExact", "copy_to": ["biblio_all", "everything"] },
+ "affiliations": { "type": "text", "index": true, "analyzer": "textIcuExact", "search_analyzer":"textIcuSearchExact" }
 }
 },
@@ -192,7 +207,10 @@
 },
 "everything": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "biblio_all": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "biblio_all": {
+ "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch",
+ "fields": {"exact": { "type": "text", "analyzer": "textIcuExact", "search_analyzer": "textIcuSearchExact" }}
+ },
 "doctype": { "type": "alias", "path": "doc_type" },
```