aboutsummaryrefslogtreecommitdiffstats
path: root/schema
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-05-14 23:09:17 -0700
committerBryan Newbold <bnewbold@archive.org>2020-05-14 23:10:04 -0700
commit4df616706146fce16dbc1fdc3b5502abd13144df (patch)
treee6ef41b0c5ef8d1e49d44467710197232368d33f /schema
parent679595a9181e56346fce7dd4e688a2cc9f1499d3 (diff)
downloadfatcat-scholar-4df616706146fce16dbc1fdc3b5502abd13144df.tar.gz
fatcat-scholar-4df616706146fce16dbc1fdc3b5502abd13144df.zip
first pass at scholar_fulltext schema
Diffstat (limited to 'schema')
-rw-r--r--schema/fulltext_schema.v01.json141
-rw-r--r--schema/scholar_fulltext.v01.json216
2 files changed, 216 insertions, 141 deletions
diff --git a/schema/fulltext_schema.v01.json b/schema/fulltext_schema.v01.json
deleted file mode 100644
index b2bd289..0000000
--- a/schema/fulltext_schema.v01.json
+++ /dev/null
@@ -1,141 +0,0 @@
-{
-"settings": {
- "index": {
- "analysis": {
- "analyzer": {
- "default": {
- "type": "custom",
- "tokenizer": "standard",
- "filter": [ "lowercase", "asciifolding" ]
- },
- "textIcu": {
- "type": "custom",
- "tokenizer": "icu_tokenizer",
- "char_filter": [ "icu_normalizer" ],
- "filter": [ "icu_folding" ]
- },
- "textIcuSearch": {
- "type": "custom",
- "tokenizer": "icu_tokenizer",
- "char_filter": [ "icu_normalizer" ],
- "filter": [ "icu_folding" ]
- }
- },
- "normalizer": {
- "default": {
- "type": "custom",
- "char_filter": [],
- "filter": ["lowercase"]
- },
- "caseSensitive": {
- "type": "custom",
- "char_filter": [],
- "filter": []
- }
- }
- }
- }
-},
-"mappings": {
- "_doc": {
- "dynamic": false,
- "_source": {
- "excludes": [
- "fulltext.abstract",
- "fulltext.body",
- "fulltext.acknowledgment",
- "fulltext.annex",
- "biblio_all",
- "everything"
- ]
- },
- "properties": {
- "fatcat_ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "fatcat_revision": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "source_tags": { "type": "keyword", "normalizer": "default", "doc_values": false },
-
- "work_id": { "type": "keyword", "normalizer": "default" },
- "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
- "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
- "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
- "release_date": { "type": "date" },
- "release_year": { "type": "integer", "copy_to": ["biblio_all", "everything"] },
- "release_type": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] },
- "release_stage": { "type": "keyword", "normalizer": "default" },
- "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] },
- "language": { "type": "keyword", "normalizer": "default" },
- "country_code": { "type": "keyword", "normalizer": "default" },
- "volume": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] },
- "issue": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] },
- "pages": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] },
- "first_page": { "type": "keyword", "normalizer": "default" },
- "number": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] },
- "doi": { "type": "keyword", "normalizer": "default" },
- "doi_prefix": { "type": "keyword", "normalizer": "default" },
- "doi_registrar": { "type": "keyword", "normalizer": "default" },
- "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "cord19_uid": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "who_covidence_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "license": { "type": "keyword", "normalizer": "default" },
- "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "publisher_type": { "type": "keyword", "normalizer": "default" },
- "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
- "container_original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
- "container_id": { "type": "keyword", "normalizer": "default" },
- "container_issnl": { "type": "keyword", "normalizer": "default" },
- "container_type": { "type": "keyword", "normalizer": "default" },
- "contrib_count": { "type": "integer" },
- "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
- "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "creator_ids": { "type": "keyword", "normalizer": "default" },
- "abstract": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything" },
- "abstract_lang": { "type": "keyword", "normalizer": "default" },
-
- "fulltext": {
- "dynamic": false,
- "properties": {
- "status": { "type": "keyword", "normalizer": "default" },
- "sha1": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "abstract": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true },
- "body": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true },
- "lang": { "type": "keyword", "normalizer": "default" },
- "acknowledgement": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true },
- "annex": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true },
-
- "ia_pdf_url": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "pdf_url": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "pdf_sha1": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "thumbnail_url": { "type": "keyword", "normalizer": "default", "doc_values": false },
- "grobid_xml_url": { "type": "keyword", "normalizer": "default", "doc_values": false }
-
- }
- },
-
- "everything": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "biblio_all": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
-
- "affiliation": { "type": "alias", "path": "affiliations" },
- "author": { "type": "alias", "path": "contrib_names" },
- "journal": { "type": "alias", "path": "container_name" },
- "date": { "type": "alias", "path": "release_date" },
- "year": { "type": "alias", "path": "release_year" },
- "issn": { "type": "alias", "path": "container_issnl" },
- "lang": { "type": "alias", "path": "language" },
- "stage": { "type": "alias", "path": "release_stage" },
- "type": { "type": "alias", "path": "release_type" },
- "country": { "type": "alias", "path": "country_code" },
-
- "source": { "type": "alias", "path": "source_tags" },
- "body": { "type": "alias", "path": "fulltext.body" }
-
- }
- }
-}
-}
diff --git a/schema/scholar_fulltext.v01.json b/schema/scholar_fulltext.v01.json
new file mode 100644
index 0000000..613ca1e
--- /dev/null
+++ b/schema/scholar_fulltext.v01.json
@@ -0,0 +1,216 @@
+{
+"settings": {
+ "index": {
+ "analysis": {
+ "analyzer": {
+ "default": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [ "lowercase", "asciifolding" ]
+ },
+ "textIcu": {
+ "type": "custom",
+ "tokenizer": "icu_tokenizer",
+ "char_filter": [ "icu_normalizer" ],
+ "filter": [ "icu_folding" ]
+ },
+ "textIcuSearch": {
+ "type": "custom",
+ "tokenizer": "icu_tokenizer",
+ "char_filter": [ "icu_normalizer" ],
+ "filter": [ "icu_folding" ]
+ }
+ },
+ "normalizer": {
+ "default": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": ["lowercase"]
+ },
+ "caseSensitive": {
+ "type": "custom",
+ "char_filter": [],
+ "filter": []
+ }
+ }
+ }
+ }
+},
+"mappings": {
+ "_doc": {
+ "dynamic": false,
+ "_source": {
+ "excludes": [
+ "abstracts.body",
+ "fulltext.body",
+ "fulltext.acknowledgement",
+ "fulltext.annex",
+ "biblio_all",
+ "everything"
+ ]
+ },
+ "properties": {
+
+ "key": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "doc_type": { "type": "keyword", "normalizer": "default" },
+ "doc_index_ts": { "type": "date" },
+ "work_ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "tags": { "type": "keyword", "normalizer": "default" },
+
+ "biblio": {
+ "type": "object",
+ "dynamic": false,
+ "properties": {
+ "release_ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
+ "subtitle": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
+ "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
+ "release_date": { "type": "date" },
+ "release_year": { "type": "integer", "copy_to": ["biblio_all", "everything"] },
+ "release_type": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] },
+ "release_stage": { "type": "keyword", "normalizer": "default" },
+ "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] },
+ "lang_code": { "type": "keyword", "normalizer": "default" },
+ "country_code": { "type": "keyword", "normalizer": "default" },
+ "volume": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] },
+ "volume_int": { "type": "integer" },
+ "issue": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] },
+ "issue_int": { "type": "integer" },
+ "pages": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] },
+ "first_page": { "type": "keyword", "normalizer": "default" },
+ "first_page_int": { "type": "integer" },
+ "number": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] },
+ "doi": { "type": "keyword", "normalizer": "default" },
+ "doi_prefix": { "type": "keyword", "normalizer": "default" },
+ "doi_registrar": { "type": "keyword", "normalizer": "default" },
+ "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "license_slug": { "type": "keyword", "normalizer": "default" },
+ "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "publisher_type": { "type": "keyword", "normalizer": "default" },
+ "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
+ "container_original_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
+ "container_ident": { "type": "keyword", "normalizer": "default" },
+ "container_issnl": { "type": "keyword", "normalizer": "default" },
+ "issns": { "type": "keyword", "normalizer": "default" },
+ "container_type": { "type": "keyword", "normalizer": "default" },
+ "contrib_count": { "type": "integer" },
+ "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
+ "affiliations": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }
+ }
+ },
+
+ "fulltext": {
+ "type": "object",
+ "dynamic": false,
+ "properties": {
+ "lang_code": { "type": "keyword", "normalizer": "default" },
+ "body": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true },
+ "acknowledgement": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true },
+ "annex": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true },
+ "release_ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "file_ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "file_sha1": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "file_mimetype": { "type": "keyword", "normalizer": "default" },
+ "thumbnail_url": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "access_url": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "access_type": { "type": "keyword", "normalizer": "default" }
+ }
+ },
+
+ "ia_sim": {
+ "type": "object",
+ "dynamic": false,
+ "properties": {
+ "ia_item": { "type": "keyword", "normalizer": "default" },
+ "ia_collection": { "type": "keyword", "normalizer": "default" },
+ "first_page": { "type": "keyword", "normalizer": "default" },
+ "pub_id": { "type": "keyword", "normalizer": "default" }
+ }
+ },
+
+ "abstracts": {
+ "type": "nested",
+ "dynamic": false,
+ "properties": {
+ "body": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": "everything", "store": true },
+ "lang_code": { "type": "keyword", "normalizer": "default" }
+ }
+ },
+
+ "releases": {
+ "type": "nested",
+ "dynamic": false,
+ "properties": {
+
+ "ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "revision": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
+ "release_date": { "type": "date" },
+ "release_year": { "type": "integer", "copy_to": ["biblio_all", "everything"] },
+ "release_type": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] },
+ "release_stage": { "type": "keyword", "normalizer": "default" },
+ "withdrawn_status": { "type": "keyword", "normalizer": "default", "copy_to": ["biblio_all", "everything"] },
+
+ "doi": { "type": "keyword", "normalizer": "default" },
+ "doi_prefix": { "type": "keyword", "normalizer": "default" },
+ "doi_registrar": { "type": "keyword", "normalizer": "default" },
+ "pmid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "pmcid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "isbn13": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "wikidata_qid": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "arxiv_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "jstor_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "mag_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "s2_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
+
+ "license_slug": { "type": "keyword", "normalizer": "default" },
+ "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch", "copy_to": ["biblio_all", "everything"] },
+ "container_ident": { "type": "keyword", "normalizer": "default" },
+ "container_issnl": { "type": "keyword", "normalizer": "default" },
+ "container_type": { "type": "keyword", "normalizer": "default" }
+ }
+ },
+
+ "access": {
+ "type": "nested",
+ "dynamic": false,
+ "properties": {
+ "access_type": { "type": "keyword", "normalizer": "default" },
+ "access_url": { "type": "keyword", "normalizer": "default", "doc_values": false },
+
+ "mimetype": { "type": "keyword", "normalizer": "default" },
+ "file_ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
+ "release_ident": { "type": "keyword", "normalizer": "default", "doc_values": false }
+ }
+ },
+
+ "everything": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+ "biblio_all": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
+
+ "body": { "type": "alias", "path": "fulltext.body" },
+ "abstract": { "type": "alias", "path": "abstracts.body" },
+ "acknowledgement":{ "type": "alias", "path": "fulltext.acknowledgement" },
+
+ "tag": { "type": "alias", "path": "tags" },
+ "affiliation": { "type": "alias", "path": "biblio.affiliations" },
+ "author": { "type": "alias", "path": "biblio.contrib_names" },
+ "journal": { "type": "alias", "path": "biblio.container_name" },
+ "date": { "type": "alias", "path": "biblio.release_date" },
+ "year": { "type": "alias", "path": "biblio.release_year" },
+ "issn": { "type": "alias", "path": "biblio.issns" },
+ "lang": { "type": "alias", "path": "biblio.lang_code" },
+ "stage": { "type": "alias", "path": "biblio.release_stage" },
+ "type": { "type": "alias", "path": "biblio.release_type" },
+ "country": { "type": "alias", "path": "biblio.country_code" }
+
+ }
+ }
+}
+}