diff options
Diffstat (limited to 'extra/elasticsearch')
| -rw-r--r-- | extra/elasticsearch/Dockerfile | 2 | ||||
| -rw-r--r-- | extra/elasticsearch/README.md | 22 | ||||
| -rw-r--r-- | extra/elasticsearch/changelog_schema.json | 4 | ||||
| -rw-r--r-- | extra/elasticsearch/container_schema.json | 4 | ||||
| -rw-r--r-- | extra/elasticsearch/fatcat_schema.json | 109 | ||||
| -rw-r--r-- | extra/elasticsearch/file_schema.json | 4 | ||||
| -rw-r--r-- | extra/elasticsearch/release_schema.json | 4 | 
7 files changed, 24 insertions, 125 deletions
diff --git a/extra/elasticsearch/Dockerfile b/extra/elasticsearch/Dockerfile index 13d641a4..c82b5f1e 100644 --- a/extra/elasticsearch/Dockerfile +++ b/extra/elasticsearch/Dockerfile @@ -1,4 +1,4 @@ -FROM docker.elastic.co/elasticsearch/elasticsearch:6.4.2 +FROM docker.elastic.co/elasticsearch/elasticsearch:7.10.1  RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install --batch analysis-icu diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index 17865bc0..196ac588 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -42,26 +42,26 @@ Drop and rebuild the schema:      http delete :9200/fatcat_container      http delete :9200/fatcat_file      http delete :9200/fatcat_changelog -    http put :9200/fatcat_release < release_schema.json -    http put :9200/fatcat_container < container_schema.json -    http put :9200/fatcat_file < file_schema.json -    http put :9200/fatcat_changelog < changelog_schema.json +    http put :9200/fatcat_release?include_type_name=true < release_schema.json +    http put :9200/fatcat_container?include_type_name=true < container_schema.json +    http put :9200/fatcat_file?include_type_name=true < file_schema.json +    http put :9200/fatcat_changelog?include_type_name=true < changelog_schema.json  Put a single object (good for debugging): -    head -n1 examples.json | http post :9200/fatcat_release/release/0 -    http get :9200/fatcat_release/release/0 +    head -n1 examples.json | http post :9200/fatcat_release/_doc/0 +    http get :9200/fatcat_release/_doc/0  Bulk insert from a file on disk: -    esbulk -verbose -id ident -index fatcat_release -type release examples.json +    esbulk -verbose -id ident -index fatcat_release -type _doc examples.json  Or, in a bulk production live-stream conversion:      export LC_ALL=C.UTF-8 -    time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py 
elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_release -type release -    time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_container -type container -    time zcat /srv/fatcat/snapshots/file_export.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_file -type file +    time zcat /srv/fatcat/snapshots/release_export_expanded.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_release -type _doc +    time zcat /srv/fatcat/snapshots/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_container -type _doc +    time zcat /srv/fatcat/snapshots/file_export.json.gz | pv -l | parallel -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_file -type _doc  ## Index Aliases @@ -94,7 +94,7 @@ To do an atomic swap from one alias to a new one ("zero downtime"):  A generic full-text "query string" query looks like this (replace "blood" with  actual query string, and "size" field with the max results to return): -    GET /fatcat_release/release/_search +    GET /fatcat_release/_search      {        "query": {          "query_string": { diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json index d8342549..6e784a57 100644 --- a/extra/elasticsearch/changelog_schema.json +++ b/extra/elasticsearch/changelog_schema.json @@ -1,6 +1,8 @@  {  "settings": {      "index": { +        "number_of_shards": 2, +        "number_of_replicas": 0,          "analysis": {              
"analyzer": {                  "default": { @@ -25,7 +27,7 @@      }  },  "mappings": { -    "changelog": { +    "_doc": {          "properties": {              "index":            { "type": "integer" },              "editgroup_id":     { "type": "keyword", "normalizer": "default", "doc_values": false }, diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index 5cd85b04..1960984d 100644 --- a/extra/elasticsearch/container_schema.json +++ b/extra/elasticsearch/container_schema.json @@ -1,6 +1,8 @@  {  "settings": {      "index": { +        "number_of_shards": 1, +        "number_of_replicas": 0,          "analysis": {              "analyzer": {                  "default": { @@ -37,7 +39,7 @@      }  },  "mappings": { -    "container": { +    "_doc": {          "properties": {              "ident":          { "type": "keyword", "normalizer": "default", "doc_values": false },              "state":          { "type": "keyword", "normalizer": "default" }, diff --git a/extra/elasticsearch/fatcat_schema.json b/extra/elasticsearch/fatcat_schema.json deleted file mode 100644 index 05583330..00000000 --- a/extra/elasticsearch/fatcat_schema.json +++ /dev/null @@ -1,109 +0,0 @@ -{ -"settings": { -    "index": { -        "analysis": { -            "analyzer": { -                "default": { -                    "type": "custom", -                    "tokenizer": "standard", -                    "filter": [ "lowercase", "asciifolding" ] -                }, -                "textIcu": { -                    "type": "custom", -                    "tokenizer": "icu_tokenizer", -                    "char_filter": [ "icu_normalizer" ], -                    "filter": [ "icu_folding" ] -                }, -                "textIcuSearch": { -                    "type": "custom", -                    "tokenizer": "icu_tokenizer", -                    "char_filter": [ "icu_normalizer" ], -                    "filter": [ "icu_folding" ] -     
           } -            } -        } -    } -}, -"mappings": { -    "release": { -        "properties": { -            "ident":          { "type": "keyword" }, -            "state":          { "type": "keyword" }, -            "revision":       { "type": "keyword" }, -            "title":          { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, -            "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, -            "release_date":   { "type": "date" }, -            "release_year":   { "type": "integer" }, -            "release_type":   { "type": "keyword" }, -            "release_status": { "type": "keyword" }, -            "language":       { "type": "keyword" }, -            "doi":            { "type": "keyword" }, -            "pmid":           { "type": "keyword" }, -            "pmcid":          { "type": "keyword" }, -            "isbn13":         { "type": "keyword" }, -            "wikidata_qid":   { "type": "keyword" }, -            "core_id":        { "type": "keyword" }, -            "axiv_id":        { "type": "keyword" }, -            "jstor_id":       { "type": "keyword" }, -            "license":        { "type": "keyword" }, -            "publisher":                { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, -            "container_name":           { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, -            "container_issnl":          { "type": "keyword" }, -            "container_type":           { "type": "keyword" }, -            "contrib_count":        { "type": "integer" }, -            "contrib_names":        { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, -            "ref_count":            { "type": "integer" }, -            "file_count":           { "type": "integer" }, -            
"fileset_count":        { "type": "integer" }, -            "webcapture_count":     { "type": "integer" }, -            "any_abstract":         { "type": "boolean" }, - -            "best_pdf_url":         { "type": "keyword" }, -            "ia_pdf_url":           { "type": "keyword" }, -            "is_oa":                { "type": "boolean" }, -            "is_longtail_oa":       { "type": "boolean" }, -            "is_preserved":         { "type": "boolean" }, -            "in_kbart":             { "type": "boolean" }, -            "in_jstor":             { "type": "boolean" }, -            "in_dweb":              { "type": "boolean" }, -            "in_web":               { "type": "boolean" }, -            "in_ia":                { "type": "boolean" }, -            "in_ia_sim":            { "type": "boolean" }, -            "in_shadows":           { "type": "boolean" }, - -            "author":         { "type": "alias", "path": "contrib_names" }, -            "journal":        { "type": "alias", "path": "container_name" }, -            "date":           { "type": "alias", "path": "release_date" }, -            "year":           { "type": "alias", "path": "release_year" }, -            "issn":           { "type": "alias", "path": "container_issnl" }, -            "oa":             { "type": "alias", "path": "is_oa" }, -            "longtail":       { "type": "alias", "path": "is_longtail_oa" }, -            "lang":           { "type": "alias", "path": "language" }, -            "file_pdf_url":   { "type": "alias", "path": "best_pdf_url" }, -            "is_kept":        { "type": "alias", "path": "in_kbart" } -        } -    }, -    "changelog": { -        "properties": { -            "index":            { "type": "integer" }, -            "editgorup_id":     { "type": "keyword" }, -            "timestamp":        { "type": "date" }, -            "username":         { "type": "keyword" }, -            "is_bot":           { "type": "boolean" }, -            
"is_admin":         { "type": "boolean" }, -            "agent":            { "type": "keyword" }, -            "containers":       { "type": "integer" }, -            "creators":         { "type": "integer" }, -            "files":            { "type": "integer" }, -            "filessets":        { "type": "integer" }, -            "webcaptures":      { "type": "integer" }, -            "releases":         { "type": "integer" }, -            "works":            { "type": "integer" }, -            "created":          { "type": "integer" }, -            "updated":          { "type": "integer" }, -            "deleted":          { "type": "integer" }, -            "total":            { "type": "integer" } -        } -    } -} -} diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json index 0fa25c3a..4635e469 100644 --- a/extra/elasticsearch/file_schema.json +++ b/extra/elasticsearch/file_schema.json @@ -1,6 +1,8 @@  {  "settings": {      "index": { +        "number_of_shards": 6, +        "number_of_replicas": 0,          "analysis": {              "analyzer": {                  "default": { @@ -25,7 +27,7 @@      }  },  "mappings": { -    "file": { +    "_doc": {          "properties": {              "ident":            { "type": "keyword", "normalizer": "default", "doc_values": false },              "state":            { "type": "keyword", "normalizer": "default" }, diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index e1f7a79a..91f2f183 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -2,6 +2,8 @@  "settings": {      "index": { +        "number_of_shards": 6, +        "number_of_replicas": 0,          "analysis": {              "analyzer": {                  "default": {                      "type": "custom", @@ -37,7 +39,7 @@      }  },  "mappings": { -    "release": { +    "_doc": {          "properties": {              "ident":   
       { "type": "keyword", "normalizer": "default", "doc_values": false },              "state":          { "type": "keyword", "normalizer": "default" },  | 
