diff options
authorBryan Newbold <bnewbold@robocracy.org>2020-12-23 11:07:41 -0800
committerBryan Newbold <bnewbold@robocracy.org>2020-12-23 11:07:41 -0800
commitd33a37eab50e95ceabadf7bbc20088ad62669564 (patch)
parent3031aa414932b39f38a6456df2a6f55f0e72dfbe (diff)
DOAJ import notes, and SQL/stats update
5 files changed, 109 insertions, 0 deletions
diff --git a/extra/stats/2020-12-07-prod-stats.json b/extra/stats/2020-12-07-prod-stats.json
new file mode 100644
index 00000000..0b3a4e25
--- /dev/null
+++ b/extra/stats/2020-12-07-prod-stats.json
@@ -0,0 +1 @@
diff --git a/extra/stats/2020-12-07-prod-tables-sizes.txt b/extra/stats/2020-12-07-prod-tables-sizes.txt
new file mode 100644
index 00000000..618232e3
--- /dev/null
+++ b/extra/stats/2020-12-07-prod-tables-sizes.txt
@@ -0,0 +1,46 @@
+Size: 676.35G
+ table_name | table_size | indexes_size | total_size
+ "public"."release_contrib" | 65 GB | 52 GB | 117 GB
+ "public"."release_rev" | 66 GB | 35 GB | 101 GB
+ "public"."refs_blob" | 93 GB | 2885 MB | 96 GB
+ "public"."file_rev" | 31 GB | 38 GB | 69 GB
+ "public"."release_edit" | 15 GB | 22 GB | 37 GB
+ "public"."work_edit" | 15 GB | 21 GB | 36 GB
+ "public"."file_edit" | 15 GB | 20 GB | 34 GB
+ "public"."file_rev_url" | 23 GB | 8437 MB | 32 GB
+ "public"."release_ident" | 10 GB | 15 GB | 25 GB
+ "public"."work_ident" | 10176 MB | 15 GB | 25 GB
+ "public"."abstracts" | 22 GB | 1832 MB | 23 GB
+ "public"."file_rev_release" | 7761 MB | 13 GB | 21 GB
+ "public"."file_ident" | 7159 MB | 13 GB | 20 GB
+ "public"."release_ref" | 5212 MB | 7393 MB | 12 GB
+ "public"."work_rev" | 6618 MB | 5825 MB | 12 GB
+ "public"."release_rev_abstract" | 3150 MB | 3988 MB | 7139 MB
+ "public"."creator_edit" | 935 MB | 1528 MB | 2463 MB
+ "public"."creator_rev" | 928 MB | 1242 MB | 2170 MB
+ "public"."editgroup" | 1145 MB | 647 MB | 1792 MB
+ "public"."creator_ident" | 631 MB | 1120 MB | 1751 MB
+ "public"."changelog" | 352 MB | 302 MB | 654 MB
+ "public"."release_rev_extid" | 221 MB | 339 MB | 561 MB
+ "public"."container_rev" | 164 MB | 41 MB | 205 MB
+ "public"."container_edit" | 46 MB | 56 MB | 102 MB
+ "public"."container_ident" | 12 MB | 25 MB | 36 MB
+ "public"."auth_oidc" | 48 kB | 48 kB | 96 kB
+ "public"."editgroup_annotation" | 48 kB | 48 kB | 96 kB
+ "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB
+ "public"."editor" | 48 kB | 48 kB | 96 kB
+ "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB
+ "public"."fileset_edit" | 16 kB | 48 kB | 64 kB
+ "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB
+ "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev" | 16 kB | 16 kB | 32 kB
+ "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB
+(41 rows)
diff --git a/extra/stats/2020-12-23-prod-stats.json b/extra/stats/2020-12-23-prod-stats.json
new file mode 100644
index 00000000..bb27d708
--- /dev/null
+++ b/extra/stats/2020-12-23-prod-stats.json
@@ -0,0 +1 @@
diff --git a/extra/stats/2020-12-23-prod-table-sizes.txt b/extra/stats/2020-12-23-prod-table-sizes.txt
new file mode 100644
index 00000000..dacdb48c
--- /dev/null
+++ b/extra/stats/2020-12-23-prod-table-sizes.txt
@@ -0,0 +1,46 @@
+Size: 684.08G
+ table_name | table_size | indexes_size | total_size
+ "public"."release_contrib" | 66 GB | 52 GB | 119 GB
+ "public"."release_rev" | 67 GB | 36 GB | 103 GB
+ "public"."refs_blob" | 94 GB | 2885 MB | 97 GB
+ "public"."file_rev" | 31 GB | 38 GB | 69 GB
+ "public"."release_edit" | 16 GB | 22 GB | 37 GB
+ "public"."work_edit" | 15 GB | 21 GB | 36 GB
+ "public"."file_edit" | 15 GB | 20 GB | 34 GB
+ "public"."file_rev_url" | 23 GB | 8456 MB | 32 GB
+ "public"."release_ident" | 10 GB | 15 GB | 26 GB
+ "public"."work_ident" | 10 GB | 15 GB | 25 GB
+ "public"."abstracts" | 23 GB | 1869 MB | 25 GB
+ "public"."file_rev_release" | 7769 MB | 13 GB | 21 GB
+ "public"."file_ident" | 7159 MB | 13 GB | 20 GB
+ "public"."release_ref" | 5277 MB | 7481 MB | 12 GB
+ "public"."work_rev" | 6724 MB | 5825 MB | 12 GB
+ "public"."release_rev_abstract" | 3306 MB | 4222 MB | 7529 MB
+ "public"."creator_edit" | 935 MB | 1528 MB | 2463 MB
+ "public"."creator_rev" | 928 MB | 1242 MB | 2170 MB
+ "public"."editgroup" | 1162 MB | 657 MB | 1820 MB
+ "public"."creator_ident" | 631 MB | 1120 MB | 1751 MB
+ "public"."release_rev_extid" | 314 MB | 469 MB | 783 MB
+ "public"."changelog" | 360 MB | 309 MB | 669 MB
+ "public"."container_rev" | 164 MB | 42 MB | 206 MB
+ "public"."container_edit" | 46 MB | 56 MB | 103 MB
+ "public"."container_ident" | 12 MB | 25 MB | 36 MB
+ "public"."webcapture_rev_cdx" | 2616 kB | 568 kB | 3184 kB
+ "public"."webcapture_edit" | 240 kB | 192 kB | 432 kB
+ "public"."webcapture_rev_url" | 192 kB | 96 kB | 288 kB
+ "public"."webcapture_rev_release" | 80 kB | 136 kB | 216 kB
+ "public"."webcapture_ident" | 88 kB | 112 kB | 200 kB
+ "public"."webcapture_rev" | 144 kB | 56 kB | 200 kB
+ "public"."auth_oidc" | 48 kB | 48 kB | 96 kB
+ "public"."editgroup_annotation" | 48 kB | 48 kB | 96 kB
+ "public"."editor" | 48 kB | 48 kB | 96 kB
+ "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB
+ "public"."fileset_edit" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev" | 16 kB | 16 kB | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB
+(41 rows)
diff --git a/notes/bulk_edits/2020-12-14_doaj.md b/notes/bulk_edits/2020-12-14_doaj.md
index 64a80fda..5e897183 100644
--- a/notes/bulk_edits/2020-12-14_doaj.md
+++ b/notes/bulk_edits/2020-12-14_doaj.md
@@ -122,3 +122,18 @@ ahead with the full import; note that other ingest is happening in parallel
zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13_all.json.gz | shuf | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
# started 2020-12-17 22:01 (Pacific)
+ => 5.45M 52:38:45 [28.8 /s]
+ => Counter({'total': 1366458, 'exists': 1020295, 'insert': 200249, 'exists-fuzzy': 144334, 'skip': 1563, 'skip-title': 1563, 'skip-doaj-id-mismatch': 17, 'update': 0})
+As total estimates (each count from the Counter above multiplied by 4):
+- total: 5,465,832
+- exists: 4,081,180
+- exists-fuzzy: 577,336
+- insert: 800,996
+Ending database size: 684.08G
+(note that regular imports were running during the same period)