about | summary | refs | log | tree | commit | diff | stats
path: root/extra
diff options
context:
space:
mode:
Diffstat (limited to 'extra')
-rw-r--r-- extra/elasticsearch/README.md | 30
-rw-r--r-- extra/elasticsearch/release_schema.json | 2
-rw-r--r-- extra/stats/2022-01-12-prod-table-sizes.txt | 47
-rw-r--r-- extra/stats/2022-01-12-prod.stats.json | 1
-rw-r--r-- extra/stats/2022-01-12-prod.v05.stats.json | 1
5 files changed, 65 insertions, 16 deletions
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
index 90019147..6d36c0fc 100644
--- a/extra/elasticsearch/README.md
+++ b/extra/elasticsearch/README.md
@@ -44,10 +44,10 @@ Drop and rebuild the schema:
http delete :9200/fatcat_file
http delete :9200/fatcat_changelog
http delete :9200/fatcat_ref
- http put :9200/fatcat_release_v03c?include_type_name=true < release_schema.json
- http put :9200/fatcat_container_v03c?include_type_name=true < container_schema.json
- http put :9200/fatcat_file_v03c?include_type_name=true < file_schema.json
- http put :9200/fatcat_changelog_v03c?include_type_name=true < changelog_schema.json
+ http put :9200/fatcat_release_v05?include_type_name=true < release_schema.json
+ http put :9200/fatcat_container_v05?include_type_name=true < container_schema.json
+ http put :9200/fatcat_file_v05?include_type_name=true < file_schema.json
+ http put :9200/fatcat_changelog_v05?include_type_name=true < changelog_schema.json
http put :9200/fatcat_ref?include_type_name=true < ref_schema.json
Put a single object (good for debugging):
@@ -63,14 +63,14 @@ Or, in a bulk production bootstrap indexing (NOTE: `--tmpdir` is important for
large indexes with small rootfs partitions):
export LC_ALL=C.UTF-8
- time zcat /srv/fatcat/snapshots/2021-03-08/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_container_v03c
- time zcat /srv/fatcat/snapshots/2021-03-08/release_export_expanded.json.gz | pv -l | parallel --tmpdir /1/tmp -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_release_v03c
- time zcat /srv/fatcat/snapshots/2021-03-08/file_export.json.gz | pv -l | parallel --tmpdir /1/tmp -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_file_v03c
+ time zcat /srv/fatcat/snapshots/2021-03-08/container_export.json.gz | pv -l | ./fatcat_transform.py elasticsearch-containers - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_container_v05
+ time zcat /srv/fatcat/snapshots/2021-03-08/release_export_expanded.json.gz | pv -l | parallel --tmpdir /1/tmp -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-releases - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_release_v05
+ time zcat /srv/fatcat/snapshots/2021-03-08/file_export.json.gz | pv -l | parallel --tmpdir /1/tmp -j20 --linebuffer --round-robin --pipe ./fatcat_transform.py elasticsearch-files - - | esbulk -verbose -size 1000 -id ident -w 8 -index fatcat_file_v05
- http put :9200/fatcat_release_v03c/_alias/fatcat_release
- http put :9200/fatcat_container_v03c/_alias/fatcat_container
- http put :9200/fatcat_file_v03c/_alias/fatcat_file
- http put :9200/fatcat_changelog_v03c/_alias/fatcat_changelog
+ http put :9200/fatcat_release_v05/_alias/fatcat_release
+ http put :9200/fatcat_container_v05/_alias/fatcat_container
+ http put :9200/fatcat_file_v05/_alias/fatcat_file
+ http put :9200/fatcat_changelog_v05/_alias/fatcat_changelog
As of April 2021, the release indexing process takes about 6 hours.
@@ -81,21 +81,21 @@ time-stamped) elasticsearch indexes, and then point to them using index
aliases. The index alias updates are fast and atomic, so we can slowly build up
a new index and then cut over with no downtime.
- http put :9200/fatcat_release_v03 < release_schema.json
+ http put :9200/fatcat_release_v05 < release_schema.json
To replace a "real" index with an alias pointer, do two actions (not truly
zero-downtime, but pretty fast):
http delete :9200/fatcat_release
- http put :9200/fatcat_release_v03/_alias/fatcat_release
+ http put :9200/fatcat_release_v05/_alias/fatcat_release
To atomically repoint an alias from one index to a new one ("zero downtime"):
http post :9200/_aliases << EOF
{
"actions": [
- { "remove": { "index": "fatcat_release_v03", "alias": "fatcat_release" }},
- { "add": { "index": "fatcat_release_v04", "alias": "fatcat_release" }}
+ { "remove": { "index": "fatcat_release_v05", "alias": "fatcat_release" }},
+ { "add": { "index": "fatcat_release_v06", "alias": "fatcat_release" }}
]
}
EOF
diff --git a/extra/elasticsearch/release_schema.json b/extra/elasticsearch/release_schema.json
index 9dae9312..fa466168 100644
--- a/extra/elasticsearch/release_schema.json
+++ b/extra/elasticsearch/release_schema.json
@@ -133,7 +133,7 @@
"journal": { "type": "alias", "path": "container_name" },
"date": { "type": "alias", "path": "release_date" },
"year": { "type": "alias", "path": "release_year" },
- "issn": { "type": "alias", "path": "container_issn" },
+ "issn": { "type": "alias", "path": "container_issns" },
"oa": { "type": "alias", "path": "is_oa" },
"longtail": { "type": "alias", "path": "is_longtail_oa" },
"lang": { "type": "alias", "path": "language" },
diff --git a/extra/stats/2022-01-12-prod-table-sizes.txt b/extra/stats/2022-01-12-prod-table-sizes.txt
new file mode 100644
index 00000000..3d640a3d
--- /dev/null
+++ b/extra/stats/2022-01-12-prod-table-sizes.txt
@@ -0,0 +1,47 @@
+Size: 795.34G
+PostgreSQL 11.6, wbgrp-svc502.us.archive.org
+
+ table_name | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib" | 81 GB | 63 GB | 144 GB
+ "public"."release_rev" | 78 GB | 39 GB | 118 GB
+ "public"."refs_blob" | 111 GB | 2885 MB | 114 GB
+ "public"."file_rev" | 35 GB | 44 GB | 79 GB
+ "public"."release_edit" | 18 GB | 23 GB | 41 GB
+ "public"."file_rev_url" | 29 GB | 11 GB | 40 GB
+ "public"."file_edit" | 17 GB | 22 GB | 39 GB
+ "public"."work_edit" | 17 GB | 22 GB | 39 GB
+ "public"."abstracts" | 30 GB | 2355 MB | 32 GB
+ "public"."release_ident" | 12 GB | 16 GB | 27 GB
+ "public"."work_ident" | 11 GB | 15 GB | 27 GB
+ "public"."file_rev_release" | 8749 MB | 16 GB | 24 GB
+ "public"."file_ident" | 7997 MB | 13 GB | 21 GB
+ "public"."release_ref" | 6349 MB | 8982 MB | 15 GB
+ "public"."work_rev" | 7477 MB | 5829 MB | 13 GB
+ "public"."release_rev_abstract" | 4441 MB | 6056 MB | 10 GB
+ "public"."creator_edit" | 935 MB | 1528 MB | 2463 MB
+ "public"."creator_rev" | 928 MB | 1242 MB | 2170 MB
+ "public"."editgroup" | 1370 MB | 762 MB | 2132 MB
+ "public"."webcapture_rev_cdx" | 1525 MB | 290 MB | 1815 MB
+ "public"."creator_ident" | 631 MB | 1120 MB | 1751 MB
+ "public"."release_rev_extid" | 523 MB | 772 MB | 1294 MB
+ "public"."changelog" | 450 MB | 382 MB | 832 MB
+ "public"."container_rev" | 206 MB | 82 MB | 288 MB
+ "public"."container_edit" | 57 MB | 79 MB | 137 MB
+ "public"."container_ident" | 13 MB | 31 MB | 44 MB
+ "public"."webcapture_edit" | 25 MB | 17 MB | 41 MB
+ "public"."webcapture_rev_url" | 21 MB | 7064 kB | 27 MB
+ "public"."webcapture_rev_release" | 7368 kB | 12 MB | 19 MB
+ "public"."webcapture_rev" | 14 MB | 4576 kB | 19 MB
+ "public"."webcapture_ident" | 8272 kB | 9064 kB | 17 MB
+ "public"."editor" | 88 kB | 136 kB | 224 kB
+ "public"."auth_oidc" | 88 kB | 128 kB | 216 kB
+ "public"."editgroup_annotation" | 64 kB | 48 kB | 112 kB
+ "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB
+ "public"."fileset_edit" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev" | 16 kB | 16 kB | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB
+(41 rows)
diff --git a/extra/stats/2022-01-12-prod.stats.json b/extra/stats/2022-01-12-prod.stats.json
new file mode 100644
index 00000000..6920a477
--- /dev/null
+++ b/extra/stats/2022-01-12-prod.stats.json
@@ -0,0 +1 @@
+{"changelog":{"latest":{"index":5705601,"timestamp":"2022-01-13T01:04:53.709983+00:00"}},"container":{"total":188756},"papers":{"in_kbart":75551430,"in_web":32110002,"in_web_not_kbart":15551322,"is_oa":21898276,"total":123871871},"release":{"refs_total":1230165938,"total":175564877}}
diff --git a/extra/stats/2022-01-12-prod.v05.stats.json b/extra/stats/2022-01-12-prod.v05.stats.json
new file mode 100644
index 00000000..275730b0
--- /dev/null
+++ b/extra/stats/2022-01-12-prod.v05.stats.json
@@ -0,0 +1 @@
+{"changelog":{"latest":{"index":5705899,"timestamp":"2022-01-13T02:28:48.619854+00:00"}},"container":{"total":186415},"papers":{"in_kbart":76702681,"in_web":32009914,"in_web_not_kbart":15065659,"is_oa":22612181,"total":123885490},"release":{"refs_total":1230468623,"total":175595974}}