aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- extra/elasticsearch/Dockerfile 2
-rw-r--r-- extra/elasticsearch/README.md 22
-rw-r--r-- extra/elasticsearch/changelog_schema.json 4
-rw-r--r-- extra/elasticsearch/container_schema.json 4
-rw-r--r-- extra/elasticsearch/fatcat_schema.json 109
-rw-r--r-- extra/elasticsearch/file_schema.json 4
-rw-r--r-- extra/elasticsearch/release_schema.json 4
7 files changed, 24 insertions, 125 deletions
diff --git a/extra/elasticsearch/Dockerfile b/extra/elasticsearch/Dockerfile
index 13d641a4..c82b5f1e 100644
--- a/extra/elasticsearch/Dockerfile
+++ b/extra/elasticsearch/Dockerfile
@@ -1,4 +1,4 @@
-FROM docker.elastic.co/elasticsearch/elasticsearch:6.4.2
+FROM docker.elastic.co/elasticsearch/elasticsearch:7.10.1
RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install --batch analysis-icu
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
index 17865bc0..196ac588 100644
--- a/extra/elasticsearch/README.md
+++ b/extra/elasticsearch/README.md
@@ -42,26 +42,26 @@ Drop and rebuild the schema:
http delete :9200/fatcat_container
http delete :9200/fatcat_file
http delete :9200/fatcat_changelog
- http put :9200/fatcat_release < release_schema.json
- http put :9200/fatcat_container < container_schema.json
- http put :9200/fatcat_file < file_schema.json
- http put :9200/fatcat_changelog < changelog_schema.json
+ http put :9200/fatcat_release?include_type_name=true < release_schema.json
+ http put :9200/fatcat_container?include_type_name=true < container_schema.json
+ http put :9200/fatcat_file?include_type_name=true < file_schema.json
+ http put :9200/fatcat_changelog?include_type_name=true < changelog_schema.json
Put a single object (good for debugging):
- head -n1 examples.json | http post :9200/fatcat_release/release/0
- http get :9200/fatcat_release/release/0
+ head -n1 examples.json | http post :9200/fatcat_release/_doc/0
+ http get :9200/fatcat_release/_doc/0
Bulk insert from a file on disk:
- esbulk -verbose -id ident -index fatcat_release -type release examples.json
+ esbulk -verbose -id ident -index fatcat_release -type _doc examples.json
Or, in a bulk production live-stream conversion:
export LC_ALL=C.UTF-8
- time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_release -type release
- time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_container -type container
- time zcat /srv/fatcat/snapshots/file_export.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_file -type file
+ time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_release -type _doc
+ time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_container -type _doc
+ time zcat /srv/fatcat/snapshots/file_export.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_file -type _doc
## Index Aliases
@@ -94,7 +94,7 @@ To do an atomic swap from one alias to a new one ("zero downtime"):
A generic full-text "query string" query looks like this (replace "blood" with
actual query string, and "size" field with the max results to return):
- GET /fatcat_release/release/_search
+ GET /fatcat_release/_search
{
"query": {
"query_string": {
diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json
index d8342549..6e784a57 100644
--- a/extra/elasticsearch/changelog_schema.json
+++ b/extra/elasticsearch/changelog_schema.json
@@ -1,6 +1,8 @@
{
"settings": {
"index": {
+ "number_of_shards": 2,
+ "number_of_replicas": 0,
"analysis": {
"analyzer": {
"default": {
@@ -25,7 +27,7 @@
}
},
"mappings": {
- "changelog": {
+ "_doc": {
"properties": {
"index": { "type": "integer" },
"editgroup_id": { "type": "keyword", "normalizer": "default", "doc_values": false },
diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json
index 5cd85b04..1960984d 100644
--- a/extra/elasticsearch/container_schema.json
+++ b/extra/elasticsearch/container_schema.json
@@ -1,6 +1,8 @@
{
"settings": {
"index": {
+ "number_of_shards": 1,
+ "number_of_replicas": 0,
"analysis": {
"analyzer": {
"default": {
@@ -37,7 +39,7 @@
}
},
"mappings": {
- "container": {
+ "_doc": {
"properties": {
"ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
"state": { "type": "keyword", "normalizer": "default" },
diff --git a/extra/elasticsearch/fatcat_schema.json b/extra/elasticsearch/fatcat_schema.json
deleted file mode 100644
index 05583330..00000000
--- a/extra/elasticsearch/fatcat_schema.json
+++ /dev/null
@@ -1,109 +0,0 @@
-{
-"settings": {
- "index": {
- "analysis": {
- "analyzer": {
- "default": {
- "type": "custom",
- "tokenizer": "standard",
- "filter": [ "lowercase", "asciifolding" ]
- },
- "textIcu": {
- "type": "custom",
- "tokenizer": "icu_tokenizer",
- "char_filter": [ "icu_normalizer" ],
- "filter": [ "icu_folding" ]
- },
- "textIcuSearch": {
- "type": "custom",
- "tokenizer": "icu_tokenizer",
- "char_filter": [ "icu_normalizer" ],
- "filter": [ "icu_folding" ]
- }
- }
- }
- }
-},
-"mappings": {
- "release": {
- "properties": {
- "ident": { "type": "keyword" },
- "state": { "type": "keyword" },
- "revision": { "type": "keyword" },
- "title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "release_date": { "type": "date" },
- "release_year": { "type": "integer" },
- "release_type": { "type": "keyword" },
- "release_status": { "type": "keyword" },
- "language": { "type": "keyword" },
- "doi": { "type": "keyword" },
- "pmid": { "type": "keyword" },
- "pmcid": { "type": "keyword" },
- "isbn13": { "type": "keyword" },
- "wikidata_qid": { "type": "keyword" },
- "core_id": { "type": "keyword" },
- "axiv_id": { "type": "keyword" },
- "jstor_id": { "type": "keyword" },
- "license": { "type": "keyword" },
- "publisher": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "container_name": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "container_issnl": { "type": "keyword" },
- "container_type": { "type": "keyword" },
- "contrib_count": { "type": "integer" },
- "contrib_names": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" },
- "ref_count": { "type": "integer" },
- "file_count": { "type": "integer" },
- "fileset_count": { "type": "integer" },
- "webcapture_count": { "type": "integer" },
- "any_abstract": { "type": "boolean" },
-
- "best_pdf_url": { "type": "keyword" },
- "ia_pdf_url": { "type": "keyword" },
- "is_oa": { "type": "boolean" },
- "is_longtail_oa": { "type": "boolean" },
- "is_preserved": { "type": "boolean" },
- "in_kbart": { "type": "boolean" },
- "in_jstor": { "type": "boolean" },
- "in_dweb": { "type": "boolean" },
- "in_web": { "type": "boolean" },
- "in_ia": { "type": "boolean" },
- "in_ia_sim": { "type": "boolean" },
- "in_shadows": { "type": "boolean" },
-
- "author": { "type": "alias", "path": "contrib_names" },
- "journal": { "type": "alias", "path": "container_name" },
- "date": { "type": "alias", "path": "release_date" },
- "year": { "type": "alias", "path": "release_year" },
- "issn": { "type": "alias", "path": "container_issnl" },
- "oa": { "type": "alias", "path": "is_oa" },
- "longtail": { "type": "alias", "path": "is_longtail_oa" },
- "lang": { "type": "alias", "path": "language" },
- "file_pdf_url": { "type": "alias", "path": "best_pdf_url" },
- "is_kept": { "type": "alias", "path": "in_kbart" }
- }
- },
- "changelog": {
- "properties": {
- "index": { "type": "integer" },
- "editgorup_id": { "type": "keyword" },
- "timestamp": { "type": "date" },
- "username": { "type": "keyword" },
- "is_bot": { "type": "boolean" },
- "is_admin": { "type": "boolean" },
- "agent": { "type": "keyword" },
- "containers": { "type": "integer" },
- "creators": { "type": "integer" },
- "files": { "type": "integer" },
- "filessets": { "type": "integer" },
- "webcaptures": { "type": "integer" },
- "releases": { "type": "integer" },
- "works": { "type": "integer" },
- "created": { "type": "integer" },
- "updated": { "type": "integer" },
- "deleted": { "type": "integer" },
- "total": { "type": "integer" }
- }
- }
-}
-}
diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json
index 0fa25c3a..4635e469 100644
--- a/extra/elasticsearch/file_schema.json
+++ b/extra/elasticsearch/file_schema.json
@@ -1,6 +1,8 @@
{
"settings": {
"index": {
+ "number_of_shards": 6,
+ "number_of_replicas": 0,
"analysis": {
"analyzer": {
"default": {
@@ -25,7 +27,7 @@
}
},
"mappings": {
- "file": {
+ "_doc": {
"properties": {
"ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
"state": { "type": "keyword", "normalizer": "default" },
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index e1f7a79a..91f2f183 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -2,6 +2,8 @@
"settings": {
"index": {
+ "number_of_shards": 6,
+ "number_of_replicas": 0,
"analysis": {
"analyzer": {
"default": {
"type": "custom",
@@ -37,7 +39,7 @@
}
},
"mappings": {
- "release": {
+ "_doc": {
"properties": {
"ident": { "type": "keyword", "normalizer": "default", "doc_values": false },
"state": { "type": "keyword", "normalizer": "default" },