aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--extra/stats/2020-12-07-prod-stats.json1
-rw-r--r--extra/stats/2020-12-07-prod-tables-sizes.txt46
-rw-r--r--extra/stats/2020-12-23-prod-stats.json1
-rw-r--r--extra/stats/2020-12-23-prod-table-sizes.txt46
-rw-r--r--notes/bulk_edits/2020-12-14_doaj.md15
5 files changed, 109 insertions, 0 deletions
diff --git a/extra/stats/2020-12-07-prod-stats.json b/extra/stats/2020-12-07-prod-stats.json
new file mode 100644
index 00000000..0b3a4e25
--- /dev/null
+++ b/extra/stats/2020-12-07-prod-stats.json
@@ -0,0 +1 @@
+{"changelog":{"latest":{"index":5021021,"timestamp":"2020-12-07T20:18:23.820391+00:00"}},"container":{"total":170246},"papers":{"in_kbart":71815549,"in_web":27500452,"in_web_not_kbart":12066730,"is_oa":17640091,"total":113444477},"release":{"refs_total":1014954200,"total":156561297}}
diff --git a/extra/stats/2020-12-07-prod-tables-sizes.txt b/extra/stats/2020-12-07-prod-tables-sizes.txt
new file mode 100644
index 00000000..618232e3
--- /dev/null
+++ b/extra/stats/2020-12-07-prod-tables-sizes.txt
@@ -0,0 +1,46 @@
+Size: 676.35G
+
+ table_name | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib" | 65 GB | 52 GB | 117 GB
+ "public"."release_rev" | 66 GB | 35 GB | 101 GB
+ "public"."refs_blob" | 93 GB | 2885 MB | 96 GB
+ "public"."file_rev" | 31 GB | 38 GB | 69 GB
+ "public"."release_edit" | 15 GB | 22 GB | 37 GB
+ "public"."work_edit" | 15 GB | 21 GB | 36 GB
+ "public"."file_edit" | 15 GB | 20 GB | 34 GB
+ "public"."file_rev_url" | 23 GB | 8437 MB | 32 GB
+ "public"."release_ident" | 10 GB | 15 GB | 25 GB
+ "public"."work_ident" | 10176 MB | 15 GB | 25 GB
+ "public"."abstracts" | 22 GB | 1832 MB | 23 GB
+ "public"."file_rev_release" | 7761 MB | 13 GB | 21 GB
+ "public"."file_ident" | 7159 MB | 13 GB | 20 GB
+ "public"."release_ref" | 5212 MB | 7393 MB | 12 GB
+ "public"."work_rev" | 6618 MB | 5825 MB | 12 GB
+ "public"."release_rev_abstract" | 3150 MB | 3988 MB | 7139 MB
+ "public"."creator_edit" | 935 MB | 1528 MB | 2463 MB
+ "public"."creator_rev" | 928 MB | 1242 MB | 2170 MB
+ "public"."editgroup" | 1145 MB | 647 MB | 1792 MB
+ "public"."creator_ident" | 631 MB | 1120 MB | 1751 MB
+ "public"."changelog" | 352 MB | 302 MB | 654 MB
+ "public"."release_rev_extid" | 221 MB | 339 MB | 561 MB
+ "public"."container_rev" | 164 MB | 41 MB | 205 MB
+ "public"."container_edit" | 46 MB | 56 MB | 102 MB
+ "public"."container_ident" | 12 MB | 25 MB | 36 MB
+ "public"."auth_oidc" | 48 kB | 48 kB | 96 kB
+ "public"."editgroup_annotation" | 48 kB | 48 kB | 96 kB
+ "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB
+ "public"."editor" | 48 kB | 48 kB | 96 kB
+ "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB
+ "public"."fileset_edit" | 16 kB | 48 kB | 64 kB
+ "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB
+ "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev" | 16 kB | 16 kB | 32 kB
+ "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB
+(41 rows)
diff --git a/extra/stats/2020-12-23-prod-stats.json b/extra/stats/2020-12-23-prod-stats.json
new file mode 100644
index 00000000..bb27d708
--- /dev/null
+++ b/extra/stats/2020-12-23-prod-stats.json
@@ -0,0 +1 @@
+{"changelog":{"latest":{"index":5073319,"timestamp":"2020-12-23T19:01:04.942860+00:00"}},"container":{"total":171139},"papers":{"in_kbart":72019264,"in_web":27596752,"in_web_not_kbart":12138388,"is_oa":18312974,"total":114347662},"release":{"refs_total":1020596562,"total":158051479}}
diff --git a/extra/stats/2020-12-23-prod-table-sizes.txt b/extra/stats/2020-12-23-prod-table-sizes.txt
new file mode 100644
index 00000000..dacdb48c
--- /dev/null
+++ b/extra/stats/2020-12-23-prod-table-sizes.txt
@@ -0,0 +1,46 @@
+Size: 684.08G
+
+ table_name | table_size | indexes_size | total_size
+---------------------------------------+------------+--------------+------------
+ "public"."release_contrib" | 66 GB | 52 GB | 119 GB
+ "public"."release_rev" | 67 GB | 36 GB | 103 GB
+ "public"."refs_blob" | 94 GB | 2885 MB | 97 GB
+ "public"."file_rev" | 31 GB | 38 GB | 69 GB
+ "public"."release_edit" | 16 GB | 22 GB | 37 GB
+ "public"."work_edit" | 15 GB | 21 GB | 36 GB
+ "public"."file_edit" | 15 GB | 20 GB | 34 GB
+ "public"."file_rev_url" | 23 GB | 8456 MB | 32 GB
+ "public"."release_ident" | 10 GB | 15 GB | 26 GB
+ "public"."work_ident" | 10 GB | 15 GB | 25 GB
+ "public"."abstracts" | 23 GB | 1869 MB | 25 GB
+ "public"."file_rev_release" | 7769 MB | 13 GB | 21 GB
+ "public"."file_ident" | 7159 MB | 13 GB | 20 GB
+ "public"."release_ref" | 5277 MB | 7481 MB | 12 GB
+ "public"."work_rev" | 6724 MB | 5825 MB | 12 GB
+ "public"."release_rev_abstract" | 3306 MB | 4222 MB | 7529 MB
+ "public"."creator_edit" | 935 MB | 1528 MB | 2463 MB
+ "public"."creator_rev" | 928 MB | 1242 MB | 2170 MB
+ "public"."editgroup" | 1162 MB | 657 MB | 1820 MB
+ "public"."creator_ident" | 631 MB | 1120 MB | 1751 MB
+ "public"."release_rev_extid" | 314 MB | 469 MB | 783 MB
+ "public"."changelog" | 360 MB | 309 MB | 669 MB
+ "public"."container_rev" | 164 MB | 42 MB | 206 MB
+ "public"."container_edit" | 46 MB | 56 MB | 103 MB
+ "public"."container_ident" | 12 MB | 25 MB | 36 MB
+ "public"."webcapture_rev_cdx" | 2616 kB | 568 kB | 3184 kB
+ "public"."webcapture_edit" | 240 kB | 192 kB | 432 kB
+ "public"."webcapture_rev_url" | 192 kB | 96 kB | 288 kB
+ "public"."webcapture_rev_release" | 80 kB | 136 kB | 216 kB
+ "public"."webcapture_ident" | 88 kB | 112 kB | 200 kB
+ "public"."webcapture_rev" | 144 kB | 56 kB | 200 kB
+ "public"."auth_oidc" | 48 kB | 48 kB | 96 kB
+ "public"."editgroup_annotation" | 48 kB | 48 kB | 96 kB
+ "public"."editor" | 48 kB | 48 kB | 96 kB
+ "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB
+ "public"."fileset_edit" | 16 kB | 48 kB | 64 kB
+ "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB
+ "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB
+ "public"."fileset_rev" | 16 kB | 16 kB | 32 kB
+ "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB
+(41 rows)
diff --git a/notes/bulk_edits/2020-12-14_doaj.md b/notes/bulk_edits/2020-12-14_doaj.md
index 64a80fda..5e897183 100644
--- a/notes/bulk_edits/2020-12-14_doaj.md
+++ b/notes/bulk_edits/2020-12-14_doaj.md
@@ -122,3 +122,18 @@ ahead with the full import; note that other ingest is happening in parallel
zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13_all.json.gz | shuf | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt -
# started 2020-12-17 22:01 (Pacific)
+
+ => 5.45M 52:38:45 [28.8 /s]
+ => Counter({'total': 1366458, 'exists': 1020295, 'insert': 200249, 'exists-fuzzy': 144334, 'skip': 1563, 'skip-title': 1563, 'skip-doaj-id-mismatch': 17, 'update': 0})
+
+Estimated totals (counter values above scaled 4x, consistent with the ~5.45M lines processed):
+
+- total: 5,465,832
+- exists: 4,081,180
+- exists-fuzzy: 577,336
+- insert: 800,996
+
+Ending database size: 684.08G
+
+(note that regular imports were running during the same period)
+