From b7404fb0f696807db3a92bc2c4c73c2d208e59ef Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 30 Jan 2020 00:51:56 -0800 Subject: ES schemas: make keywords case-insensitive by default But not applying asciifolding; don't see any need to do so? --- extra/elasticsearch/changelog_schema.json | 20 +++++-- extra/elasticsearch/container_schema.json | 38 ++++++++----- extra/elasticsearch/file_schema.json | 34 ++++++++---- extra/elasticsearch/release_schema.json | 89 ++++++++++++++++++------------- 4 files changed, 115 insertions(+), 66 deletions(-) (limited to 'extra') diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index 77c77238..d958fed9 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -8,6 +8,18 @@ "tokenizer": "standard", "filter": [ "lowercase", "asciifolding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -16,13 +28,13 @@ "changelog": { "properties": { "index": { "type": "integer" }, - "editgroup_id": { "type": "keyword", "doc_values": false }, + "editgroup_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "timestamp": { "type": "date" }, - "editor_id": { "type": "keyword" }, - "username": { "type": "keyword" }, + "editor_id": { "type": "keyword", "normalizer": "default" }, + "username": { "type": "keyword", "normalizer": "caseSensitive" }, "is_bot": { "type": "boolean" }, "is_admin": { "type": "boolean" }, - "agent": { "type": "keyword" }, + "agent": { "type": "keyword", "normalizer": "caseSensitive" }, "containers": { "type": "integer" }, "new_containers": { "type": "integer" }, diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index 3be261a2..be3a408e 100644 --- a/extra/elasticsearch/container_schema.json +++ 
b/extra/elasticsearch/container_schema.json @@ -20,6 +20,18 @@ "char_filter": [ "icu_normalizer" ], "filter": [ "icu_folding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -27,23 +39,23 @@ "mappings": { "container": { "properties": { - "ident": { "type": "keyword", "doc_values": false }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword", "doc_values": false }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, "name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "abbrev": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "aliases": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, - "container_type": { "type": "keyword" }, - "issnl": { "type": "keyword" }, - "issns": { "type": "keyword" }, - "wikidata_qid": { "type": "keyword" }, - "country": { "type": "keyword" }, - "region": { "type": "keyword" }, - "discipline": { "type": "keyword" }, - "languages": { "type": "keyword" }, - "mimetypes": { "type": "keyword" }, + "container_type": { "type": "keyword", "normalizer": "default" }, + "issnl": { "type": "keyword", "normalizer": "default" }, + "issns": { "type": "keyword", "normalizer": "default" }, + "wikidata_qid": { "type": "keyword", "normalizer": "default" }, + "country": { 
"type": "keyword", "normalizer": "default" }, + "region": { "type": "keyword", "normalizer": "default" }, + "discipline": { "type": "keyword", "normalizer": "default" }, + "languages": { "type": "keyword", "normalizer": "default" }, + "mimetypes": { "type": "keyword", "normalizer": "default" }, "first_year": { "type": "integer" }, "last_year": { "type": "integer" }, @@ -57,7 +69,7 @@ "any_kbart": { "type": "boolean" }, "any_jstor": { "type": "boolean" }, "any_ia_sim": { "type": "boolean" }, - "sherpa_romeo_color": { "type": "keyword" }, + "sherpa_romeo_color": { "type": "keyword", "normalizer": "default" }, "releases_total": { "type": "integer" }, "releases_kbart": { "type": "integer" }, diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index a0ac3346..9c8ee64c 100644 --- a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -8,6 +8,18 @@ "tokenizer": "standard", "filter": [ "lowercase", "asciifolding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } @@ -15,21 +27,21 @@ "mappings": { "file": { "properties": { - "ident": { "type": "keyword", "doc_values": false }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword", "doc_values": false }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "release_ids": { "type": "keyword", "doc_values": false }, + "release_ids": { "type": "keyword", "normalizer": "default", "doc_values": false }, "release_count": { "type": "integer" }, - "mimetype": { "type": "keyword" }, + "mimetype": { "type": "keyword", "normalizer": "default" }, "size_bytes": { "type": "integer" }, - "sha1": { "type": "keyword", "doc_values": false }, - 
"sha256": { "type": "keyword", "doc_values": false }, - "md5": { "type": "keyword", "doc_values": false }, + "sha1": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "sha256": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "md5": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "domains": { "type": "keyword" }, - "hosts": { "type": "keyword" }, - "rels": { "type": "keyword" }, + "domains": { "type": "keyword", "normalizer": "default" }, + "hosts": { "type": "keyword", "normalizer": "default" }, + "rels": { "type": "keyword", "normalizer": "default" }, "in_ia": { "type": "boolean" }, "in_ia_petabox": { "type": "boolean" }, diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 3d301dba..f983a703 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -20,58 +20,71 @@ "char_filter": [ "icu_normalizer" ], "filter": [ "icu_folding" ] } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } } } } +} }, "mappings": { "release": { "properties": { - "ident": { "type": "keyword", "doc_values": false }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword", "doc_values": false }, - "work_id": { "type": "keyword", "doc_values": false }, + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "work_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": 
"biblio" }, "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "release_date": { "type": "date" }, "release_year": { "type": "integer", "copy_to": "biblio" }, - "release_type": { "type": "keyword", "copy_to": "biblio" }, - "release_stage": { "type": "keyword" }, - "withdrawn_status": { "type": "keyword", "copy_to": "biblio" }, - "language": { "type": "keyword" }, - "country": { "type": "keyword" }, - "volume": { "type": "keyword", "copy_to": "biblio" }, - "issue": { "type": "keyword", "copy_to": "biblio" }, - "pages": { "type": "keyword", "copy_to": "biblio" }, - "first_page": { "type": "keyword" }, - "number": { "type": "keyword", "copy_to": "biblio" }, - "doi": { "type": "keyword", "doc_values": false }, - "doi_prefix": { "type": "keyword" }, - "doi_registrar": { "type": "keyword" }, - "pmid": { "type": "keyword", "doc_values": false }, - "pmcid": { "type": "keyword", "doc_values": false }, - "isbn13": { "type": "keyword", "doc_values": false }, - "wikidata_qid": { "type": "keyword", "doc_values": false }, - "core_id": { "type": "keyword", "doc_values": false }, - "axiv_id": { "type": "keyword", "doc_values": false }, - "jstor_id": { "type": "keyword", "doc_values": false }, - "ark_id": { "type": "keyword", "doc_values": false }, - "mag_id": { "type": "keyword", "doc_values": false }, - "license": { "type": "keyword" }, + "release_type": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "release_stage": { "type": "keyword", "normalizer": "default" }, + "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "language": { "type": "keyword", "normalizer": "default" }, + "country": { "type": "keyword", "normalizer": "default" }, + "volume": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "issue": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "pages": { "type": "keyword", "normalizer": 
"default", "copy_to": "biblio" }, + "first_page": { "type": "keyword", "normalizer": "default" }, + "number": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "doi": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "doi_prefix": { "type": "keyword", "normalizer": "default" }, + "doi_registrar": { "type": "keyword", "normalizer": "default" }, + "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "core_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "axiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "ark_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "license": { "type": "keyword", "normalizer": "default" }, "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, - "container_id": { "type": "keyword" }, - "container_issnl": { "type": "keyword" }, - "container_type": { "type": "keyword" }, + "container_id": { "type": "keyword", "normalizer": "default" }, + "container_issnl": { "type": "keyword", "normalizer": "default" }, + "container_type": { "type": "keyword", "normalizer": "default" }, "contrib_count": { "type": "integer" }, "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", 
"search_analyzer":"textIcuSearch" }, - "affiliation_rors": { "type": "keyword" }, - "creator_ids": { "type": "keyword" }, + "affiliation_rors": { "type": "keyword", "normalizer": "default" }, + "creator_ids": { "type": "keyword", "normalizer": "default" }, "ref_count": { "type": "integer" }, "ref_linked_count": { "type": "integer" }, - "ref_release_ids": { "type": "keyword" }, + "ref_release_ids": { "type": "keyword", "normalizer": "default" }, "file_count": { "type": "integer" }, "fileset_count": { "type": "integer" }, "webcapture_count": { "type": "integer" }, @@ -79,11 +92,11 @@ "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "best_pdf_url": { "type": "keyword", "doc_values": false }, - "ia_pdf_url": { "type": "keyword", "doc_values": false }, - "ia_microfilm_url": { "type": "keyword", "doc_values": false }, + "best_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "ia_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "ia_microfilm_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, "is_oa": { "type": "boolean" }, - "oa_color": { "type": "keyword" }, + "oa_color": { "type": "keyword", "normalizer": "default" }, "is_longtail_oa": { "type": "boolean" }, "is_preserved": { "type": "boolean" }, "in_kbart": { "type": "boolean" }, @@ -95,7 +108,7 @@ "in_shadows": { "type": "boolean" }, "is_superceded": { "type": "boolean" }, "is_retracted": { "type": "boolean" }, - "preservation": { "type": "keyword" }, + "preservation": { "type": "keyword", "normalizer": "default" }, "affilation": { "type": "alias", "path": "affiliations" }, "ror": { "type": "alias", "path": "affiliation_rors" }, -- cgit v1.2.3