summary refs log tree commit diff stats
path: root/extra/elasticsearch
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@robocracy.org> 2020-01-30 00:51:56 -0800
committer Bryan Newbold <bnewbold@robocracy.org> 2020-01-30 00:52:01 -0800
commit b7404fb0f696807db3a92bc2c4c73c2d208e59ef (patch)
tree e5b74f29fd183427543789ec512a31c6978518e8 /extra/elasticsearch
parent 59912583926077260d99a9bf77a938c2215eb6c8 (diff)
download fatcat-b7404fb0f696807db3a92bc2c4c73c2d208e59ef.tar.gz
fatcat-b7404fb0f696807db3a92bc2c4c73c2d208e59ef.zip
ES schemas: make keywords case-insensitive by default
But not applying asciifolding; don't see any need to do so?
Diffstat (limited to 'extra/elasticsearch')
-rw-r--r-- extra/elasticsearch/changelog_schema.json 20
-rw-r--r-- extra/elasticsearch/container_schema.json 38
-rw-r--r-- extra/elasticsearch/file_schema.json 34
-rw-r--r-- extra/elasticsearch/release_schema.json 89
4 files changed, 115 insertions, 66 deletions
diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json
index 77c77238..d958fed9 100644
--- a/extra/elasticsearch/changelog_schema.json
+++ b/extra/elasticsearch/changelog_schema.json
@@ -8,6 +8,18 @@
"tokenizer": "standard",
"filter": [ "lowercase", "asciifolding" ]
}
+ },
+ "normalizer": {
+ "default": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": ["lowercase"]
+ },
+ "caseSensitive": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": []
+ }
}
}
}
@@ -16,13 +28,13 @@
"changelog": {
"properties": {
"index": { "type": "integer" },
- "editgroup_id": { "type": "keyword", "doc_values": false },
+ "editgroup_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
"timestamp": { "type": "date" },
- "editor_id": { "type": "keyword" },
- "username": { "type": "keyword" },
+ "editor_id": { "type": "keyword", "normalizer": "default" },
+ "username": { "type": "keyword", "normalizer": "caseSensitive" },
"is_bot": { "type": "boolean" },
"is_admin": { "type": "boolean" },
- "agent": { "type": "keyword" },
+ "agent": { "type": "keyword", "normalizer": "caseSensitive" },
"containers": { "type": "integer" },
"new_containers": { "type": "integer" },
diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json
index 3be261a2..be3a408e 100644
--- a/extra/elasticsearch/container_schema.json
+++ b/extra/elasticsearch/container_schema.json
@@ -20,6 +20,18 @@
"char_filter": [ "icu_normalizer" ],
"filter": [ "icu_folding" ]
}
+ },
+ "normalizer": {
+ "default": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": ["lowercase"]
+ },
+ "caseSensitive": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": []
+ }
}
}
}
@@ -27,23 +39,23 @@
"mappings": {
"container": {
"properties": {
- "ident": { "type": "keyword", "doc_values": false },
- "state": { "type": "keyword" },
- "revision": { "type": "keyword", "doc_values": false },
+ "ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "state": { "type": "keyword", "normalizer": "default" },
+ "revision": { "type": "keyword", "normalizer": "default", "doc_values": false },
"name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
"original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
"publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
"abbrev": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
"aliases": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
- "container_type": { "type": "keyword" },
- "issnl": { "type": "keyword" },
- "issns": { "type": "keyword" },
- "wikidata_qid": { "type": "keyword" },
- "country": { "type": "keyword" },
- "region": { "type": "keyword" },
- "discipline": { "type": "keyword" },
- "languages": { "type": "keyword" },
- "mimetypes": { "type": "keyword" },
+ "container_type": { "type": "keyword", "normalizer": "default" },
+ "issnl": { "type": "keyword", "normalizer": "default" },
+ "issns": { "type": "keyword", "normalizer": "default" },
+ "wikidata_qid": { "type": "keyword", "normalizer": "default" },
+ "country": { "type": "keyword", "normalizer": "default" },
+ "region": { "type": "keyword", "normalizer": "default" },
+ "discipline": { "type": "keyword", "normalizer": "default" },
+ "languages": { "type": "keyword", "normalizer": "default" },
+ "mimetypes": { "type": "keyword", "normalizer": "default" },
"first_year": { "type": "integer" },
"last_year": { "type": "integer" },
@@ -57,7 +69,7 @@
"any_kbart": { "type": "boolean" },
"any_jstor": { "type": "boolean" },
"any_ia_sim": { "type": "boolean" },
- "sherpa_romeo_color": { "type": "keyword" },
+ "sherpa_romeo_color": { "type": "keyword", "normalizer": "default" },
"releases_total": { "type": "integer" },
"releases_kbart": { "type": "integer" },
diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json
index a0ac3346..9c8ee64c 100644
--- a/extra/elasticsearch/file_schema.json
+++ b/extra/elasticsearch/file_schema.json
@@ -8,6 +8,18 @@
"tokenizer": "standard",
"filter": [ "lowercase", "asciifolding" ]
}
+ },
+ "normalizer": {
+ "default": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": ["lowercase"]
+ },
+ "caseSensitive": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": []
+ }
}
}
}
@@ -15,21 +27,21 @@
"mappings": {
"file": {
"properties": {
- "ident": { "type": "keyword", "doc_values": false },
- "state": { "type": "keyword" },
- "revision": { "type": "keyword", "doc_values": false },
+ "ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "state": { "type": "keyword", "normalizer": "default" },
+ "revision": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "release_ids": { "type": "keyword", "doc_values": false },
+ "release_ids": { "type": "keyword", "normalizer": "default", "doc_values": false },
"release_count": { "type": "integer" },
- "mimetype": { "type": "keyword" },
+ "mimetype": { "type": "keyword", "normalizer": "default" },
"size_bytes": { "type": "integer" },
- "sha1": { "type": "keyword", "doc_values": false },
- "sha256": { "type": "keyword", "doc_values": false },
- "md5": { "type": "keyword", "doc_values": false },
+ "sha1": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "sha256": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "md5": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "domains": { "type": "keyword" },
- "hosts": { "type": "keyword" },
- "rels": { "type": "keyword" },
+ "domains": { "type": "keyword", "normalizer": "default" },
+ "hosts": { "type": "keyword", "normalizer": "default" },
+ "rels": { "type": "keyword", "normalizer": "default" },
"in_ia": { "type": "boolean" },
"in_ia_petabox": { "type": "boolean" },
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 3d301dba..f983a703 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -20,58 +20,70 @@
"char_filter": [ "icu_normalizer" ],
"filter": [ "icu_folding" ]
}
+ },
+ "normalizer": {
+ "default": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": ["lowercase"]
+ },
+ "caseSensitive": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": []
+ }
}
}
}
},
"mappings": {
"release": {
"properties": {
- "ident": { "type": "keyword", "doc_values": false },
- "state": { "type": "keyword" },
- "revision": { "type": "keyword", "doc_values": false },
- "work_id": { "type": "keyword", "doc_values": false },
+ "ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "state": { "type": "keyword", "normalizer": "default" },
+ "revision": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "work_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
"title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
"subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
"original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
"release_date": { "type": "date" },
"release_year": { "type": "integer", "copy_to": "biblio" },
- "release_type": { "type": "keyword", "copy_to": "biblio" },
- "release_stage": { "type": "keyword" },
- "withdrawn_status": { "type": "keyword", "copy_to": "biblio" },
- "language": { "type": "keyword" },
- "country": { "type": "keyword" },
- "volume": { "type": "keyword", "copy_to": "biblio" },
- "issue": { "type": "keyword", "copy_to": "biblio" },
- "pages": { "type": "keyword", "copy_to": "biblio" },
- "first_page": { "type": "keyword" },
- "number": { "type": "keyword", "copy_to": "biblio" },
- "doi": { "type": "keyword", "doc_values": false },
- "doi_prefix": { "type": "keyword" },
- "doi_registrar": { "type": "keyword" },
- "pmid": { "type": "keyword", "doc_values": false },
- "pmcid": { "type": "keyword", "doc_values": false },
- "isbn13": { "type": "keyword", "doc_values": false },
- "wikidata_qid": { "type": "keyword", "doc_values": false },
- "core_id": { "type": "keyword", "doc_values": false },
- "axiv_id": { "type": "keyword", "doc_values": false },
- "jstor_id": { "type": "keyword", "doc_values": false },
- "ark_id": { "type": "keyword", "doc_values": false },
- "mag_id": { "type": "keyword", "doc_values": false },
- "license": { "type": "keyword" },
+ "release_type": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "release_stage": { "type": "keyword", "normalizer": "default" },
+ "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "language": { "type": "keyword", "normalizer": "default" },
+ "country": { "type": "keyword", "normalizer": "default" },
+ "volume": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "issue": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "first_page": { "type": "keyword", "normalizer": "default" },
+ "number": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" },
+ "doi": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "doi_prefix": { "type": "keyword", "normalizer": "default" },
+ "doi_registrar": { "type": "keyword", "normalizer": "default" },
+ "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "core_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "axiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "ark_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "license": { "type": "keyword", "normalizer": "default" },
"publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
"container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
- "container_id": { "type": "keyword" },
- "container_issnl": { "type": "keyword" },
- "container_type": { "type": "keyword" },
+ "container_id": { "type": "keyword", "normalizer": "default" },
+ "container_issnl": { "type": "keyword", "normalizer": "default" },
+ "container_type": { "type": "keyword", "normalizer": "default" },
"contrib_count": { "type": "integer" },
"contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" },
"affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "affiliation_rors": { "type": "keyword" },
- "creator_ids": { "type": "keyword" },
+ "affiliation_rors": { "type": "keyword", "normalizer": "default" },
+ "creator_ids": { "type": "keyword", "normalizer": "default" },
"ref_count": { "type": "integer" },
"ref_linked_count": { "type": "integer" },
- "ref_release_ids": { "type": "keyword" },
+ "ref_release_ids": { "type": "keyword", "normalizer": "default" },
"file_count": { "type": "integer" },
"fileset_count": { "type": "integer" },
"webcapture_count": { "type": "integer" },
@@ -79,11 +92,11 @@
"biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "best_pdf_url": { "type": "keyword", "doc_values": false },
- "ia_pdf_url": { "type": "keyword", "doc_values": false },
- "ia_microfilm_url": { "type": "keyword", "doc_values": false },
+ "best_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false },
+ "ia_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false },
+ "ia_microfilm_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false },
"is_oa": { "type": "boolean" },
- "oa_color": { "type": "keyword" },
+ "oa_color": { "type": "keyword", "normalizer": "default" },
"is_longtail_oa": { "type": "boolean" },
"is_preserved": { "type": "boolean" },
"in_kbart": { "type": "boolean" },
@@ -95,7 +108,7 @@
"in_shadows": { "type": "boolean" },
"is_superceded": { "type": "boolean" },
"is_retracted": { "type": "boolean" },
- "preservation": { "type": "keyword" },
+ "preservation": { "type": "keyword", "normalizer": "default" },
"affilation": { "type": "alias", "path": "affiliations" },
"ror": { "type": "alias", "path": "affiliation_rors" },