diff options
-rw-r--r-- | extra/elasticsearch/Dockerfile | 2 | ||||
-rw-r--r-- | extra/elasticsearch/README.md | 22 | ||||
-rw-r--r-- | extra/elasticsearch/changelog_schema.json | 4 | ||||
-rw-r--r-- | extra/elasticsearch/container_schema.json | 4 | ||||
-rw-r--r-- | extra/elasticsearch/fatcat_schema.json | 109 | ||||
-rw-r--r-- | extra/elasticsearch/file_schema.json | 4 | ||||
-rw-r--r-- | extra/elasticsearch/release_schema.json | 4 |
7 files changed, 24 insertions, 125 deletions
diff --git a/extra/elasticsearch/Dockerfile b/extra/elasticsearch/Dockerfile index 13d641a4..c82b5f1e 100644 --- a/extra/elasticsearch/Dockerfile +++ b/extra/elasticsearch/Dockerfile @@ -1,4 +1,4 @@ -FROM docker.elastic.co/elasticsearch/elasticsearch:6.4.2 +FROM docker.elastic.co/elasticsearch/elasticsearch:7.10.1 RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install --batch analysis-icu diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index 17865bc0..196ac588 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -42,26 +42,26 @@ Drop and rebuild the schema: http delete :9200/fatcat_container http delete :9200/fatcat_file http delete :9200/fatcat_changelog - http put :9200/fatcat_release < release_schema.json - http put :9200/fatcat_container < container_schema.json - http put :9200/fatcat_file < file_schema.json - http put :9200/fatcat_changelog < changelog_schema.json + http put :9200/fatcat_release?include_type_name=true < release_schema.json + http put :9200/fatcat_container?include_type_name=true < container_schema.json + http put :9200/fatcat_file?include_type_name=true < file_schema.json + http put :9200/fatcat_changelog?include_type_name=true < changelog_schema.json Put a single object (good for debugging): - head -n1 examples.json | http post :9200/fatcat_release/release/0 - http get :9200/fatcat_release/release/0 + head -n1 examples.json | http post :9200/fatcat_release/_doc/0 + http get :9200/fatcat_release/_doc/0 Bulk insert from a file on disk: - esbulk -verbose -id ident -index fatcat_release -type release examples.json + esbulk -verbose -id ident -index fatcat_release -type _doc examples.json Or, in a bulk production live-stream conversion: export LC_ALL=C.UTF-8 - time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index 
fatcat_release -type release - time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_container -type container - time zcat /srv/fatcat/snapshots/file_export.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_file -type file + time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_release -type _doc + time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_container -type _doc + time zcat /srv/fatcat/snapshots/file_export.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_file -type _doc ## Index Aliases @@ -94,7 +94,7 @@ To do an atomic swap from one alias to a new one ("zero downtime"): A generic full-text "query string" query looks like this (replace "blood" with the actual query string, and the "size" field with the max results to return): - GET /fatcat_release/release/_search + GET /fatcat_release/_search { "query": { "query_string": { diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index d8342549..6e784a57 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -1,6 +1,8 @@ { "settings": { "index": { + "number_of_shards": 2, + "number_of_replicas": 0, "analysis": { "analyzer": { "default": { @@ -25,7 +27,7 @@ } }, "mappings": { - "changelog": { + "_doc": { "properties": { "index": { "type": "integer" }, "editgroup_id": { "type": 
"keyword", "normalizer": "default", "doc_values": false }, diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index 5cd85b04..1960984d 100644 --- a/extra/elasticsearch/container_schema.json +++ b/extra/elasticsearch/container_schema.json @@ -1,6 +1,8 @@ { "settings": { "index": { + "number_of_shards": 1, + "number_of_replicas": 0, "analysis": { "analyzer": { "default": { @@ -37,7 +39,7 @@ } }, "mappings": { - "container": { + "_doc": { "properties": { "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, "state": { "type": "keyword", "normalizer": "default" }, diff --git a/extra/elasticsearch/fatcat_schema.json b/extra/elasticsearch/fatcat_schema.json deleted file mode 100644 index 05583330..00000000 --- a/extra/elasticsearch/fatcat_schema.json +++ /dev/null @@ -1,109 +0,0 @@ -{ -"settings": { - "index": { - "analysis": { - "analyzer": { - "default": { - "type": "custom", - "tokenizer": "standard", - "filter": [ "lowercase", "asciifolding" ] - }, - "textIcu": { - "type": "custom", - "tokenizer": "icu_tokenizer", - "char_filter": [ "icu_normalizer" ], - "filter": [ "icu_folding" ] - }, - "textIcuSearch": { - "type": "custom", - "tokenizer": "icu_tokenizer", - "char_filter": [ "icu_normalizer" ], - "filter": [ "icu_folding" ] - } - } - } - } -}, -"mappings": { - "release": { - "properties": { - "ident": { "type": "keyword" }, - "state": { "type": "keyword" }, - "revision": { "type": "keyword" }, - "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "release_date": { "type": "date" }, - "release_year": { "type": "integer" }, - "release_type": { "type": "keyword" }, - "release_status": { "type": "keyword" }, - "language": { "type": "keyword" }, - "doi": { "type": "keyword" }, - "pmid": { "type": "keyword" }, - "pmcid": { "type": 
"keyword" }, - "isbn13": { "type": "keyword" }, - "wikidata_qid": { "type": "keyword" }, - "core_id": { "type": "keyword" }, - "axiv_id": { "type": "keyword" }, - "jstor_id": { "type": "keyword" }, - "license": { "type": "keyword" }, - "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "container_issnl": { "type": "keyword" }, - "container_type": { "type": "keyword" }, - "contrib_count": { "type": "integer" }, - "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, - "ref_count": { "type": "integer" }, - "file_count": { "type": "integer" }, - "fileset_count": { "type": "integer" }, - "webcapture_count": { "type": "integer" }, - "any_abstract": { "type": "boolean" }, - - "best_pdf_url": { "type": "keyword" }, - "ia_pdf_url": { "type": "keyword" }, - "is_oa": { "type": "boolean" }, - "is_longtail_oa": { "type": "boolean" }, - "is_preserved": { "type": "boolean" }, - "in_kbart": { "type": "boolean" }, - "in_jstor": { "type": "boolean" }, - "in_dweb": { "type": "boolean" }, - "in_web": { "type": "boolean" }, - "in_ia": { "type": "boolean" }, - "in_ia_sim": { "type": "boolean" }, - "in_shadows": { "type": "boolean" }, - - "author": { "type": "alias", "path": "contrib_names" }, - "journal": { "type": "alias", "path": "container_name" }, - "date": { "type": "alias", "path": "release_date" }, - "year": { "type": "alias", "path": "release_year" }, - "issn": { "type": "alias", "path": "container_issnl" }, - "oa": { "type": "alias", "path": "is_oa" }, - "longtail": { "type": "alias", "path": "is_longtail_oa" }, - "lang": { "type": "alias", "path": "language" }, - "file_pdf_url": { "type": "alias", "path": "best_pdf_url" }, - "is_kept": { "type": "alias", "path": "in_kbart" } - } - }, - "changelog": { - "properties": { - "index": { "type": "integer" }, - 
"editgorup_id": { "type": "keyword" }, - "timestamp": { "type": "date" }, - "username": { "type": "keyword" }, - "is_bot": { "type": "boolean" }, - "is_admin": { "type": "boolean" }, - "agent": { "type": "keyword" }, - "containers": { "type": "integer" }, - "creators": { "type": "integer" }, - "files": { "type": "integer" }, - "filessets": { "type": "integer" }, - "webcaptures": { "type": "integer" }, - "releases": { "type": "integer" }, - "works": { "type": "integer" }, - "created": { "type": "integer" }, - "updated": { "type": "integer" }, - "deleted": { "type": "integer" }, - "total": { "type": "integer" } - } - } -} -} diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index 0fa25c3a..4635e469 100644 --- a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -1,6 +1,8 @@ { "settings": { "index": { + "number_of_shards": 6, + "number_of_replicas": 0, "analysis": { "analyzer": { "default": { @@ -25,7 +27,7 @@ } }, "mappings": { - "file": { + "_doc": { "properties": { "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, "state": { "type": "keyword", "normalizer": "default" }, diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index e1f7a79a..91f2f183 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -2,6 +2,8 @@ "settings": { "index": { + "number_of_shards": 6, + "number_of_replicas": 0, "analysis": { "analyzer": { "default": { "type": "custom", @@ -37,7 +39,7 @@ } }, "mappings": { - "release": { + "_doc": { "properties": { "ident": { "type": "keyword", "normalizer": "default", "doc_values": false }, "state": { "type": "keyword", "normalizer": "default" }, |