From 7bd3d790f29fe791915f68852086c69abb2f06bb Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 1 Apr 2020 15:28:40 -0700 Subject: elasticsearch schemas --- schema/fulltext_schema.v00.json | 134 +++++++++++++++++++++++++++++++++++++++ schema/release_schema.v03b.json | 136 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 270 insertions(+) create mode 100644 schema/fulltext_schema.v00.json create mode 100644 schema/release_schema.v03b.json diff --git a/schema/fulltext_schema.v00.json b/schema/fulltext_schema.v00.json new file mode 100644 index 0000000..694048e --- /dev/null +++ b/schema/fulltext_schema.v00.json @@ -0,0 +1,134 @@ +{ +"settings": { + "index": { + "analysis": { + "analyzer": { + "default": { + "type": "custom", + "tokenizer": "standard", + "filter": [ "lowercase", "asciifolding" ] + }, + "textIcu": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ "icu_normalizer" ], + "filter": [ "icu_folding" ] + }, + "textIcuSearch": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ "icu_normalizer" ], + "filter": [ "icu_folding" ] + } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } + } + } + } +}, +"mappings": { + "release": { + "dynamic": false, + "_source": { + "excludes": [ + "fulltext.abstract", + "fulltext.body", + "fulltext.acknowledgment", + "fulltext.annex", + "everything" + ] + }, + "properties": { + "fatcat_ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "fatcat_revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "cord19_uid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + + "work_id": { "type": "keyword", "normalizer": "default" }, + "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "release_date": { "type": "date" }, + "release_year": { "type": "integer", "copy_to": ["biblio_all", "everything"] }, + "release_type": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "release_stage": { "type": "keyword", "normalizer": "default" }, + "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "language": { "type": "keyword", "normalizer": "default" }, + "country_code": { "type": "keyword", "normalizer": "default" }, + "volume": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "issue": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "pages": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "first_page": { "type": "keyword", "normalizer": "default" }, + "number": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "doi": { "type": "keyword", "normalizer": "default" }, + "doi_prefix": { "type": "keyword", "normalizer": "default" }, + "doi_registrar": { "type": "keyword", "normalizer": "default" }, + "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "license": { "type": "keyword", "normalizer": "default" }, + "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "publisher_type": { "type": "keyword", "normalizer": "default" }, + "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "container_id": { "type": "keyword", "normalizer": "default" }, + "container_issnl": { "type": "keyword", "normalizer": "default" }, + "container_type": { "type": "keyword", "normalizer": "default" }, + "contrib_count": { "type": "integer" }, + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "creator_ids": { "type": "keyword", "normalizer": "default" }, + "abstract": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything" }, + "abstract_lang": { "type": "keyword", "normalizer": "default" }, + + "fulltext": { + "dynamic": false, + "properties": { + "status": { "type": "keyword", "normalizer": "default" }, + "sha1": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "abstract": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, + "body": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, + "lang": { "type": "keyword", "normalizer": "default" }, + "acknowledgement": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, + "annex": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, + + "pdf_url": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "pdf_sha1": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "thumbnail_url": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "grobid_xml_url": { "type": "keyword", "normalizer": "default", "doc_values": false } + + } + }, + + "everything": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "biblio_all": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + + "affiliation": { "type": "alias", "path": "affiliations" }, + "author": { "type": "alias", "path": "contrib_names" }, + "journal": { "type": "alias", "path": "container_name" }, + "date": { "type": "alias", "path": "release_date" }, + "year": { "type": "alias", "path": "release_year" }, + "issn": { "type": "alias", "path": "container_issnl" }, + "lang": { "type": "alias", "path": "language" }, + "stage": { "type": "alias", "path": "release_stage" }, + "type": { "type": "alias", "path": "release_type" }, + + "body": { "type": "alias", "path": "fulltext.body" } + + } + } +} +} diff --git a/schema/release_schema.v03b.json b/schema/release_schema.v03b.json new file mode 100644 index 0000000..666a672 --- /dev/null +++ b/schema/release_schema.v03b.json @@ -0,0 +1,136 @@ +{ +"settings": { + "index": { + "analysis": { + "analyzer": { + "default": { + "type": "custom", + "tokenizer": "standard", + "filter": [ "lowercase", "asciifolding" ] + }, + "textIcu": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ "icu_normalizer" ], + "filter": [ "icu_folding" ] + }, + "textIcuSearch": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ "icu_normalizer" ], + "filter": [ "icu_folding" ] + } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } + } + } + } +}, +"mappings": { + "release": { + "properties": { + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "state": { "type": "keyword", "normalizer": "default" }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "work_id": { "type": "keyword", "normalizer": "default" }, + "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "release_date": { "type": "date" }, + "release_year": { "type": "integer", "copy_to": "biblio" }, + "release_type": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "release_stage": { "type": "keyword", "normalizer": "default" }, + "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "language": { "type": "keyword", "normalizer": "default" }, + "country_code": { "type": "keyword", "normalizer": "default" }, + "country_code_upper": { "type": "keyword", "normalizer": "caseSensitive" }, + "volume": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "issue": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "pages": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "first_page": { "type": "keyword", "normalizer": "default" }, + "number": { "type": "keyword", "normalizer": "default", "copy_to": "biblio" }, + "doi": { "type": "keyword", "normalizer": "default" }, + "doi_prefix": { "type": "keyword", "normalizer": "default" }, + "doi_registrar": { "type": "keyword", "normalizer": "default" }, + "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "core_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "ark_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "license": { "type": "keyword", "normalizer": "default" }, + "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "publisher_type": { "type": "keyword", "normalizer": "default" }, + "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "container_id": { "type": "keyword", "normalizer": "default" }, + "container_issnl": { "type": "keyword", "normalizer": "default" }, + "container_type": { "type": "keyword", "normalizer": "default" }, + "contrib_count": { "type": "integer" }, + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "biblio" }, + "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "affiliation_rors": { "type": "keyword", "normalizer": "default" }, + "creator_ids": { "type": "keyword", "normalizer": "default" }, + "ref_count": { "type": "integer" }, + "ref_linked_count": { "type": "integer" }, + "ref_release_ids": { "type": "keyword", "normalizer": "default" }, + "file_count": { "type": "integer" }, + "fileset_count": { "type": "integer" }, + "webcapture_count": { "type": "integer" }, + "any_abstract": { "type": "boolean" }, + + "biblio": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + + "best_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "ia_pdf_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "ia_microfilm_url": { "type": "keyword", "normalizer": "caseSensitive", "doc_values": false }, + "is_oa": { "type": "boolean" }, + "oa_color": { "type": "keyword", "normalizer": "default" }, + "is_longtail_oa": { "type": "boolean" }, + "is_preserved": { "type": "boolean" }, + "in_kbart": { "type": "boolean" }, + "in_jstor": { "type": "boolean" }, + "in_dweb": { "type": "boolean" }, + "in_web": { "type": "boolean" }, + "in_ia": { "type": "boolean" }, + "in_ia_sim": { "type": "boolean" }, + "in_shadows": { "type": "boolean" }, + "is_superceded": { "type": "boolean" }, + "is_retracted": { "type": "boolean" }, + "preservation": { "type": "keyword", "normalizer": "default" }, + + "affiliation": { "type": "alias", "path": "affiliations" }, + "ror": { "type": "alias", "path": "affiliation_rors" }, + "creator_id": { "type": "alias", "path": "creator_ids" }, + "ref_release_id": { "type": "alias", "path": "ref_release_ids" }, + "author": { "type": "alias", "path": "contrib_names" }, + "journal": { "type": "alias", "path": "container_name" }, + "date": { "type": "alias", "path": "release_date" }, + "year": { "type": "alias", "path": "release_year" }, + "issn": { "type": "alias", "path": "container_issnl" }, + "oa": { "type": "alias", "path": "is_oa" }, + "longtail": { "type": "alias", "path": "is_longtail_oa" }, + "lang": { "type": "alias", "path": "language" }, + "file_pdf_url": { "type": "alias", "path": "best_pdf_url" }, + "release_status": { "type": "alias", "path": "release_stage" }, + "stage": { "type": "alias", "path": "release_stage" }, + "type": { "type": "alias", "path": "release_type" }, + "retracted": { "type": "alias", "path": "is_retracted" }, + "is_kept": { "type": "alias", "path": "in_kbart" } + } + } +} +} -- cgit v1.2.3