diff options
-rw-r--r-- | notes/es-indexing.md | 38 |
1 files changed, 38 insertions, 0 deletions
diff --git a/notes/es-indexing.md b/notes/es-indexing.md index 04fca2e..61c192e 100644 --- a/notes/es-indexing.md +++ b/notes/es-indexing.md @@ -2,6 +2,44 @@ Going to do some initial indexing of refs ("BiblioRefs" schema) into elasticsearch 7 cluster. +## 2021-07-16 + +* generated "bref" dataset on aitio, + "/magna/refcat/2021-07-06/BrefCombined/date-2021-07-06.json.zst", + d69838fb71623a83b60e03be3493042b27539567 +* 1,865,637,767 docs, about 40% increase since last version + +The index name will be: `fatcat_ref_v02_20210716` + + http put :9200/fatcat_ref_v02_20210716 < /srv/fatcat/src/extra/elasticsearch/ref_schema.json + +Single node (all shards pinned to one host): + + http put ":9200/fatcat_ref_v02_20210716/_settings" index.routing.allocation.include._name=wbgrp-svc500 + +Confirm: + + $ http get :9200/_cat/shards/fatcat_ref_v02_20210716 + HTTP/1.1 200 OK + content-encoding: gzip + content-length: 117 + content-type: text/plain; charset=UTF-8 + + fatcat_ref_v02_20210716 3 p STARTED 0 208b 207.241.225.228 wbgrp-svc500 + fatcat_ref_v02_20210716 1 p STARTED 0 208b 207.241.225.228 wbgrp-svc500 + fatcat_ref_v02_20210716 2 p STARTED 0 208b 207.241.225.228 wbgrp-svc500 + fatcat_ref_v02_20210716 4 p STARTED 0 208b 207.241.225.228 wbgrp-svc500 + fatcat_ref_v02_20210716 5 p STARTED 0 208b 207.241.225.228 wbgrp-svc500 + fatcat_ref_v02_20210716 0 p STARTED 0 208b 207.241.225.228 wbgrp-svc500 + +Expecting 1,865,637,767 edges, as we improved deduplication since v01. + + zstdcat -T0 /srv/fatcat/datasets/fatcat_refs.date-2021-07-06.json.zst | esbulk -verbose -size 2000 -id _id -w 8 -index fatcat_ref_v02_20210716 + +Watch indexing: + + watch -n 10 'curl -s localhost:9200/_cat/indices | grep fatcat_ref_v02_20210716' + ## 2021-04-12 Reduced `number_of_shards` from 12 to 6. |