From 679595a9181e56346fce7dd4e688a2cc9f1499d3 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 13 May 2020 19:41:27 -0700 Subject: schema starting point (from covid19) --- schema/fulltext_schema.v01.json | 141 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 schema/fulltext_schema.v01.json diff --git a/schema/fulltext_schema.v01.json b/schema/fulltext_schema.v01.json new file mode 100644 index 0000000..b2bd289 --- /dev/null +++ b/schema/fulltext_schema.v01.json @@ -0,0 +1,141 @@ +{ +"settings": { + "index": { + "analysis": { + "analyzer": { + "default": { + "type": "custom", + "tokenizer": "standard", + "filter": [ "lowercase", "asciifolding" ] + }, + "textIcu": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ "icu_normalizer" ], + "filter": [ "icu_folding" ] + }, + "textIcuSearch": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ "icu_normalizer" ], + "filter": [ "icu_folding" ] + } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } + } + } + } +}, +"mappings": { + "_doc": { + "dynamic": false, + "_source": { + "excludes": [ + "fulltext.abstract", + "fulltext.body", + "fulltext.acknowledgment", + "fulltext.annex", + "biblio_all", + "everything" + ] + }, + "properties": { + "fatcat_ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "fatcat_revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "source_tags": { "type": "keyword", "normalizer": "default", "doc_values": false }, + + "work_id": { "type": "keyword", "normalizer": "default" }, + "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "release_date": { "type": "date" }, + "release_year": { "type": "integer", "copy_to": ["biblio_all", "everything"] }, + "release_type": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "release_stage": { "type": "keyword", "normalizer": "default" }, + "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "language": { "type": "keyword", "normalizer": "default" }, + "country_code": { "type": "keyword", "normalizer": "default" }, + "volume": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "issue": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "pages": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "first_page": { "type": "keyword", "normalizer": "default" }, + "number": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "doi": { "type": "keyword", "normalizer": "default" }, + "doi_prefix": { "type": "keyword", "normalizer": "default" }, + "doi_registrar": { "type": "keyword", "normalizer": "default" }, + "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "cord19_uid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "who_covidence_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "license": { "type": "keyword", "normalizer": "default" }, + "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "publisher_type": { "type": "keyword", "normalizer": "default" }, + "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "container_original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "container_id": { "type": "keyword", "normalizer": "default" }, + "container_issnl": { "type": "keyword", "normalizer": "default" }, + "container_type": { "type": "keyword", "normalizer": "default" }, + "contrib_count": { "type": "integer" }, + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "creator_ids": { "type": "keyword", "normalizer": "default" }, + "abstract": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything" }, + "abstract_lang": { "type": "keyword", "normalizer": "default" }, + + "fulltext": { + "dynamic": false, + "properties": { + "status": { "type": "keyword", "normalizer": "default" }, + "sha1": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "abstract": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, + "body": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, + "lang": { "type": "keyword", "normalizer": "default" }, + "acknowledgement": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, + "annex": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, + + "ia_pdf_url": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "pdf_url": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "pdf_sha1": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "thumbnail_url": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "grobid_xml_url": { "type": "keyword", "normalizer": "default", "doc_values": false } + + } + }, + + "everything": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "biblio_all": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + + "affiliation": { "type": "alias", "path": "affiliations" }, + "author": { "type": "alias", "path": "contrib_names" }, + "journal": { "type": "alias", "path": "container_name" }, + "date": { "type": "alias", "path": "release_date" }, + "year": { "type": "alias", "path": "release_year" }, + "issn": { "type": "alias", "path": "container_issnl" }, + "lang": { "type": "alias", "path": "language" }, + "stage": { "type": "alias", "path": "release_stage" }, + "type": { "type": "alias", "path": "release_type" }, + "country": { "type": "alias", "path": "country_code" }, + + "source": { "type": "alias", "path": "source_tags" }, + "body": { "type": "alias", "path": "fulltext.body" } + + } + } +} +} -- cgit v1.2.3