From 4df616706146fce16dbc1fdc3b5502abd13144df Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 14 May 2020 23:09:17 -0700 Subject: first pass at scholar_fulltext schema --- schema/fulltext_schema.v01.json | 141 ------------------------- schema/scholar_fulltext.v01.json | 216 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 216 insertions(+), 141 deletions(-) delete mode 100644 schema/fulltext_schema.v01.json create mode 100644 schema/scholar_fulltext.v01.json diff --git a/schema/fulltext_schema.v01.json b/schema/fulltext_schema.v01.json deleted file mode 100644 index b2bd289..0000000 --- a/schema/fulltext_schema.v01.json +++ /dev/null @@ -1,141 +0,0 @@ -{ -"settings": { - "index": { - "analysis": { - "analyzer": { - "default": { - "type": "custom", - "tokenizer": "standard", - "filter": [ "lowercase", "asciifolding" ] - }, - "textIcu": { - "type": "custom", - "tokenizer": "icu_tokenizer", - "char_filter": [ "icu_normalizer" ], - "filter": [ "icu_folding" ] - }, - "textIcuSearch": { - "type": "custom", - "tokenizer": "icu_tokenizer", - "char_filter": [ "icu_normalizer" ], - "filter": [ "icu_folding" ] - } - }, - "normalizer": { - "default": { - "type": "custom", - "char_filter": [], - "filter": ["lowercase"] - }, - "caseSensitive": { - "type": "custom", - "char_filter": [], - "filter": [] - } - } - } - } -}, -"mappings": { - "_doc": { - "dynamic": false, - "_source": { - "excludes": [ - "fulltext.abstract", - "fulltext.body", - "fulltext.acknowledgment", - "fulltext.annex", - "biblio_all", - "everything" - ] - }, - "properties": { - "fatcat_ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "fatcat_revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "source_tags": { "type": "keyword", "normalizer": "default", "doc_values": false }, - - "work_id": { "type": "keyword", "normalizer": "default" }, - "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, - "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, - "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, - "release_date": { "type": "date" }, - "release_year": { "type": "integer", "copy_to": ["biblio_all", "everything"] }, - "release_type": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, - "release_stage": { "type": "keyword", "normalizer": "default" }, - "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, - "language": { "type": "keyword", "normalizer": "default" }, - "country_code": { "type": "keyword", "normalizer": "default" }, - "volume": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, - "issue": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, - "pages": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, - "first_page": { "type": "keyword", "normalizer": "default" }, - "number": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, - "doi": { "type": "keyword", "normalizer": "default" }, - "doi_prefix": { "type": "keyword", "normalizer": "default" }, - "doi_registrar": { "type": "keyword", "normalizer": "default" }, - "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "cord19_uid": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "who_covidence_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "license": { "type": "keyword", "normalizer": "default" }, - "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "publisher_type": { "type": "keyword", "normalizer": "default" }, - "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, - "container_original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, - "container_id": { "type": "keyword", "normalizer": "default" }, - "container_issnl": { "type": "keyword", "normalizer": "default" }, - "container_type": { "type": "keyword", "normalizer": "default" }, - "contrib_count": { "type": "integer" }, - "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, - "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "creator_ids": { "type": "keyword", "normalizer": "default" }, - "abstract": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything" }, - "abstract_lang": { "type": "keyword", "normalizer": "default" }, - - "fulltext": { - "dynamic": false, - "properties": { - "status": { "type": "keyword", "normalizer": "default" }, - "sha1": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "abstract": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, - "body": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, - "lang": { "type": "keyword", "normalizer": "default" }, - "acknowledgement": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, - "annex": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, - - "ia_pdf_url": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "pdf_url": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "pdf_sha1": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "thumbnail_url": { "type": "keyword", "normalizer": "default", "doc_values": false }, - "grobid_xml_url": { "type": "keyword", "normalizer": "default", "doc_values": false } - - } - }, - - "everything": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "biblio_all": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - - "affiliation": { "type": "alias", "path": "affiliations" }, - "author": { "type": "alias", "path": "contrib_names" }, - "journal": { "type": "alias", "path": "container_name" }, - "date": { "type": "alias", "path": "release_date" }, - "year": { "type": "alias", "path": "release_year" }, - "issn": { "type": "alias", "path": "container_issnl" }, - "lang": { "type": "alias", "path": "language" }, - "stage": { "type": "alias", "path": "release_stage" }, - "type": { "type": "alias", "path": "release_type" }, - "country": { "type": "alias", "path": "country_code" }, - - "source": { "type": "alias", "path": "source_tags" }, - "body": { "type": "alias", "path": "fulltext.body" } - - } - } -} -} diff --git a/schema/scholar_fulltext.v01.json b/schema/scholar_fulltext.v01.json new file mode 100644 index 0000000..613ca1e --- /dev/null +++ b/schema/scholar_fulltext.v01.json @@ -0,0 +1,216 @@ +{ +"settings": { + "index": { + "analysis": { + "analyzer": { + "default": { + "type": "custom", + "tokenizer": "standard", + "filter": [ "lowercase", "asciifolding" ] + }, + "textIcu": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ "icu_normalizer" ], + "filter": [ "icu_folding" ] + }, + "textIcuSearch": { + "type": "custom", + "tokenizer": "icu_tokenizer", + "char_filter": [ "icu_normalizer" ], + "filter": [ "icu_folding" ] + } + }, + "normalizer": { + "default": { + "type": "custom", + "char_filter": [], + "filter": ["lowercase"] + }, + "caseSensitive": { + "type": "custom", + "char_filter": [], + "filter": [] + } + } + } + } +}, +"mappings": { + "_doc": { + "dynamic": false, + "_source": { + "excludes": [ + "abstracts.body", + "fulltext.body", + "fulltext.acknowledgment", + "fulltext.annex", + "biblio_all", + "everything" + ] + }, + "properties": { + + "key": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "doc_type": { "type": "keyword", "normalizer": "default" }, + "doc_index_ts": { "type": "timestamp" }, + "work_ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "tags": { "type": "keyword", "normalizer": "default" }, + + "biblio": { + "type": "object", + "dynamic": false, + "properties": { + "release_ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "release_date": { "type": "date" }, + "release_year": { "type": "integer", "copy_to": ["biblio_all", "everything"] }, + "release_type": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "release_stage": { "type": "keyword", "normalizer": "default" }, + "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "lang_code": { "type": "keyword", "normalizer": "default" }, + "country_code": { "type": "keyword", "normalizer": "default" }, + "volume": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "volume_int": { "type": "integer" }, + "issue": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "issue_int": { "type": "integer" }, + "pages": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "first_page": { "type": "keyword", "normalizer": "default" }, + "first_page_int": { "type": "integer" }, + "number": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "doi": { "type": "keyword", "normalizer": "default" }, + "doi_prefix": { "type": "keyword", "normalizer": "default" }, + "doi_registrar": { "type": "keyword", "normalizer": "default" }, + "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "license_slug": { "type": "keyword", "normalizer": "default" }, + "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "publisher_type": { "type": "keyword", "normalizer": "default" }, + "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "container_original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "container_ident": { "type": "keyword", "normalizer": "default" }, + "container_issnl": { "type": "keyword", "normalizer": "default" }, + "issns": { "type": "keyword", "normalizer": "default" }, + "container_type": { "type": "keyword", "normalizer": "default" }, + "contrib_count": { "type": "integer" }, + "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" } + } + }, + + "fulltext": { + "type": "object", + "dynamic": false, + "properties": { + "lang_code": { "type": "keyword", "normalizer": "default" }, + "body": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, + "acknowledgement": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, + "annex": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, + "release_ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "file_ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "file_sha1": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "file_mimetype": { "type": "keyword", "normalizer": "default" }, + "thumbnail_url": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "access_url": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "access_type": { "type": "keyword", "normalizer": "default" } + } + }, + + "ia_sim": { + "type": "object", + "dynamic": false, + "properties": { + "ia_item": { "type": "keyword", "normalizer": "default" }, + "ia_collection": { "type": "keyword", "normalizer": "default" }, + "first_page": { "type": "keyword", "normalizer": "default" }, + "pub_id": { "type": "keyword", "normalizer": "default" } + } + }, + + "abstracts": { + "type": "nested", + "dynamic": false, + "properties": { + "body": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true }, + "lang_code": { "type": "keyword", "normalizer": "default" } + } + }, + + "releases": { + "type": "nested", + "dynamic": false, + "properties": { + + "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "revision": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "release_date": { "type": "date" }, + "release_year": { "type": "integer", "copy_to": ["biblio_all", "everything"] }, + "release_type": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + "release_stage": { "type": "keyword", "normalizer": "default" }, + "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] }, + + "doi": { "type": "keyword", "normalizer": "default" }, + "doi_prefix": { "type": "keyword", "normalizer": "default" }, + "doi_registrar": { "type": "keyword", "normalizer": "default" }, + "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false }, + + "license_slug": { "type": "keyword", "normalizer": "default" }, + "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] }, + "container_ident": { "type": "keyword", "normalizer": "default" }, + "container_issnl": { "type": "keyword", "normalizer": "default" }, + "container_type": { "type": "keyword", "normalizer": "default" } + } + }, + + "access": { + "type": "nested", + "dynamic": false, + "properties": { + "access_type": { "type": "keyword", "normalizer": "default" }, + "access_url": { "type": "keyword", "normalizer": "default", "doc_values": false }, + + "mimetype": { "type": "keyword", "normalizer": "default" }, + "file_ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, + "release_ident": { "type": "keyword", "normalizer": "default", "doc_values": false } + } + }, + + "everything": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + "biblio_all": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, + + "body": { "type": "alias", "path": "fulltext.body" }, + "abstract": { "type": "alias", "path": "abstracts.body" }, + "acknowledgement":{ "type": "alias", "path": "fulltext.acknowledgement" }, + + "tag": { "type": "alias", "path": "tags" }, + "affiliation": { "type": "alias", "path": "biblio.affiliations" }, + "author": { "type": "alias", "path": "biblio.contrib_names" }, + "journal": { "type": "alias", "path": "biblio.container_name" }, + "date": { "type": "alias", "path": "biblio.release_date" }, + "year": { "type": "alias", "path": "biblio.release_year" }, + "issn": { "type": "alias", "path": "biblio.issns" }, + "lang": { "type": "alias", "path": "biblio.lang_code" }, + "stage": { "type": "alias", "path": "biblio.release_stage" }, + "type": { "type": "alias", "path": "biblio.release_type" }, + "country": { "type": "alias", "path": "biblio.country_code" } + + } + } +} +} -- cgit v1.2.3