diff options
Diffstat (limited to 'extra')
-rw-r--r-- | extra/elasticsearch/README.md | 30 | ||||
-rw-r--r-- | extra/elasticsearch/release_schema.json | 2 | ||||
-rw-r--r-- | extra/stats/2022-01-12-prod-table-sizes.txt | 47 | ||||
-rw-r--r-- | extra/stats/2022-01-12-prod.stats.json | 1 | ||||
-rw-r--r-- | extra/stats/2022-01-12-prod.v05.stats.json | 1 |
5 files changed, 65 insertions, 16 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md index 90019147..6d36c0fc 100644 --- a/extra/elasticsearch/README.md +++ b/extra/elasticsearch/README.md @@ -44,10 +44,10 @@ Drop and rebuild the schema: http delete :9200/fatcat_file http delete :9200/fatcat_changelog http delete :9200/fatcat_ref - http put :9200/fatcat_release_v03c?include_type_name=true < release_schema.json - http put :9200/fatcat_container_v03c?include_type_name=true < container_schema.json - http put :9200/fatcat_file_v03c?include_type_name=true < file_schema.json - http put :9200/fatcat_changelog_v03c?include_type_name=true < changelog_schema.json + http put :9200/fatcat_release_v05?include_type_name=true < release_schema.json + http put :9200/fatcat_container_v05?include_type_name=true < container_schema.json + http put :9200/fatcat_file_v05?include_type_name=true < file_schema.json + http put :9200/fatcat_changelog_v05?include_type_name=true < changelog_schema.json http put :9200/fatcat_ref?include_type_name=true < ref_schema.json Put a single object (good for debugging): @@ -63,14 +63,14 @@ Or, in a bulk production bootstrap indexing (NOTE: `--tmpdir` is important for large indexes with small rootfs partitions): export LC_ALL=C.UTF-8 - time zcat /srv/fatcat/snapshots/2021-03-08/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_container_v03c - time zcat /srv/fatcat/snapshots/2021-03-08/release_export_expanded.json.gz | pv -l | parallel --tmpdir /1/tmp -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_release_v03c - time zcat /srv/fatcat/snapshots/2021-03-08/file_export.json.gz | pv -l | parallel --tmpdir /1/tmp -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_file_v03c + time zcat /srv/fatcat/snapshots/2021-03-08/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_container_v05 + time zcat /srv/fatcat/snapshots/2021-03-08/release_export_expanded.json.gz | pv -l | parallel --tmpdir /1/tmp -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_release_v05 + time zcat /srv/fatcat/snapshots/2021-03-08/file_export.json.gz | pv -l | parallel --tmpdir /1/tmp -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_file_v05 - http put :9200/fatcat_release_v03c/_alias/fatcat_release - http put :9200/fatcat_container_v03c/_alias/fatcat_container - http put :9200/fatcat_file_v03c/_alias/fatcat_file - http put :9200/fatcat_changelog_v03c/_alias/fatcat_changelog + http put :9200/fatcat_release_v05/_alias/fatcat_release + http put :9200/fatcat_container_v05/_alias/fatcat_container + http put :9200/fatcat_file_v05/_alias/fatcat_file + http put :9200/fatcat_changelog_v05/_alias/fatcat_changelog As of April 2021, the release indexing process takes about 6 hours. @@ -81,21 +81,21 @@ time-stamped) elasticsearch indexes, and then point to them using index aliases. The index alias updates are fast and atomic, so we can slowly build up a new index and then cut over with no downtime. - http put :9200/fatcat_release_v03 < release_schema.json + http put :9200/fatcat_release_v05 < release_schema.json To replace a "real" index with an alias pointer, do two actions (not truly zero-downtime, but pretty fast): http delete :9200/fatcat_release - http put :9200/fatcat_release_v03/_alias/fatcat_release + http put :9200/fatcat_release_v05/_alias/fatcat_release To do an atomic swap from one alias to a new one ("zero downtime"): http post :9200/_aliases << EOF { "actions": [ - { "remove": { "index": "fatcat_release_v03", "alias": "fatcat_release" }}, - { "add": { "index": "fatcat_release_v04", "alias": "fatcat_release" }} + { "remove": { "index": "fatcat_release_v05", "alias": "fatcat_release" }}, + { "add": { "index": "fatcat_release_v06", "alias": "fatcat_release" }} ] } EOF diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json index 9dae9312..fa466168 100644 --- a/extra/elasticsearch/release_schema.json +++ b/extra/elasticsearch/release_schema.json @@ -133,7 +133,7 @@ "journal": { "type": "alias", "path": "container_name" }, "date": { "type": "alias", "path": "release_date" }, "year": { "type": "alias", "path": "release_year" }, - "issn": { "type": "alias", "path": "container_issn" }, + "issn": { "type": "alias", "path": "container_issns" }, "oa": { "type": "alias", "path": "is_oa" }, "longtail": { "type": "alias", "path": "is_longtail_oa" }, "lang": { "type": "alias", "path": "language" }, diff --git a/extra/stats/2022-01-12-prod-table-sizes.txt b/extra/stats/2022-01-12-prod-table-sizes.txt new file mode 100644 index 00000000..3d640a3d --- /dev/null +++ b/extra/stats/2022-01-12-prod-table-sizes.txt @@ -0,0 +1,47 @@ +Size: 795.34G +PostgreSQL 11.6, wbgrp-svc502.us.archive.org + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 81 GB | 63 GB | 144 GB + "public"."release_rev" | 78 GB | 39 GB | 118 GB + "public"."refs_blob" | 111 GB | 2885 MB | 114 GB + "public"."file_rev" | 35 GB | 44 GB | 79 GB + "public"."release_edit" | 18 GB | 23 GB | 41 GB + "public"."file_rev_url" | 29 GB | 11 GB | 40 GB + "public"."file_edit" | 17 GB | 22 GB | 39 GB + "public"."work_edit" | 17 GB | 22 GB | 39 GB + "public"."abstracts" | 30 GB | 2355 MB | 32 GB + "public"."release_ident" | 12 GB | 16 GB | 27 GB + "public"."work_ident" | 11 GB | 15 GB | 27 GB + "public"."file_rev_release" | 8749 MB | 16 GB | 24 GB + "public"."file_ident" | 7997 MB | 13 GB | 21 GB + "public"."release_ref" | 6349 MB | 8982 MB | 15 GB + "public"."work_rev" | 7477 MB | 5829 MB | 13 GB + "public"."release_rev_abstract" | 4441 MB | 6056 MB | 10 GB + "public"."creator_edit" | 935 MB | 1528 MB | 2463 MB + "public"."creator_rev" | 928 MB | 1242 MB | 2170 MB + "public"."editgroup" | 1370 MB | 762 MB | 2132 MB + "public"."webcapture_rev_cdx" | 1525 MB | 290 MB | 1815 MB + "public"."creator_ident" | 631 MB | 1120 MB | 1751 MB + "public"."release_rev_extid" | 523 MB | 772 MB | 1294 MB + "public"."changelog" | 450 MB | 382 MB | 832 MB + "public"."container_rev" | 206 MB | 82 MB | 288 MB + "public"."container_edit" | 57 MB | 79 MB | 137 MB + "public"."container_ident" | 13 MB | 31 MB | 44 MB + "public"."webcapture_edit" | 25 MB | 17 MB | 41 MB + "public"."webcapture_rev_url" | 21 MB | 7064 kB | 27 MB + "public"."webcapture_rev_release" | 7368 kB | 12 MB | 19 MB + "public"."webcapture_rev" | 14 MB | 4576 kB | 19 MB + "public"."webcapture_ident" | 8272 kB | 9064 kB | 17 MB + "public"."editor" | 88 kB | 136 kB | 224 kB + "public"."auth_oidc" | 88 kB | 128 kB | 216 kB + "public"."editgroup_annotation" | 64 kB | 48 kB | 112 kB + "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) diff --git a/extra/stats/2022-01-12-prod.stats.json b/extra/stats/2022-01-12-prod.stats.json new file mode 100644 index 00000000..6920a477 --- /dev/null +++ b/extra/stats/2022-01-12-prod.stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":5705601,"timestamp":"2022-01-13T01:04:53.709983+00:00"}},"container":{"total":188756},"papers":{"in_kbart":75551430,"in_web":32110002,"in_web_not_kbart":15551322,"is_oa":21898276,"total":123871871},"release":{"refs_total":1230165938,"total":175564877}} diff --git a/extra/stats/2022-01-12-prod.v05.stats.json b/extra/stats/2022-01-12-prod.v05.stats.json new file mode 100644 index 00000000..275730b0 --- /dev/null +++ b/extra/stats/2022-01-12-prod.v05.stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":5705899,"timestamp":"2022-01-13T02:28:48.619854+00:00"}},"container":{"total":186415},"papers":{"in_kbart":76702681,"in_web":32009914,"in_web_not_kbart":15065659,"is_oa":22612181,"total":123885490},"release":{"refs_total":1230468623,"total":175595974}} |