diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-23 11:07:41 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2020-12-23 11:07:41 -0800 |
commit | d33a37eab50e95ceabadf7bbc20088ad62669564 (patch) | |
tree | 87f9e6d83808abebc7dd647dcff5fa3f7290f139 | |
parent | 3031aa414932b39f38a6456df2a6f55f0e72dfbe (diff) | |
download | fatcat-d33a37eab50e95ceabadf7bbc20088ad62669564.tar.gz fatcat-d33a37eab50e95ceabadf7bbc20088ad62669564.zip |
DOAJ import notes, and SQL/stats update
-rw-r--r-- | extra/stats/2020-12-07-prod-stats.json | 1 | ||||
-rw-r--r-- | extra/stats/2020-12-07-prod-tables-sizes.txt | 46 | ||||
-rw-r--r-- | extra/stats/2020-12-23-prod-stats.json | 1 | ||||
-rw-r--r-- | extra/stats/2020-12-23-prod-table-sizes.txt | 46 | ||||
-rw-r--r-- | notes/bulk_edits/2020-12-14_doaj.md | 15 |
5 files changed, 109 insertions, 0 deletions
diff --git a/extra/stats/2020-12-07-prod-stats.json b/extra/stats/2020-12-07-prod-stats.json new file mode 100644 index 00000000..0b3a4e25 --- /dev/null +++ b/extra/stats/2020-12-07-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":5021021,"timestamp":"2020-12-07T20:18:23.820391+00:00"}},"container":{"total":170246},"papers":{"in_kbart":71815549,"in_web":27500452,"in_web_not_kbart":12066730,"is_oa":17640091,"total":113444477},"release":{"refs_total":1014954200,"total":156561297}} diff --git a/extra/stats/2020-12-07-prod-tables-sizes.txt b/extra/stats/2020-12-07-prod-tables-sizes.txt new file mode 100644 index 00000000..618232e3 --- /dev/null +++ b/extra/stats/2020-12-07-prod-tables-sizes.txt @@ -0,0 +1,46 @@ +Size: 676.35G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 65 GB | 52 GB | 117 GB + "public"."release_rev" | 66 GB | 35 GB | 101 GB + "public"."refs_blob" | 93 GB | 2885 MB | 96 GB + "public"."file_rev" | 31 GB | 38 GB | 69 GB + "public"."release_edit" | 15 GB | 22 GB | 37 GB + "public"."work_edit" | 15 GB | 21 GB | 36 GB + "public"."file_edit" | 15 GB | 20 GB | 34 GB + "public"."file_rev_url" | 23 GB | 8437 MB | 32 GB + "public"."release_ident" | 10 GB | 15 GB | 25 GB + "public"."work_ident" | 10176 MB | 15 GB | 25 GB + "public"."abstracts" | 22 GB | 1832 MB | 23 GB + "public"."file_rev_release" | 7761 MB | 13 GB | 21 GB + "public"."file_ident" | 7159 MB | 13 GB | 20 GB + "public"."release_ref" | 5212 MB | 7393 MB | 12 GB + "public"."work_rev" | 6618 MB | 5825 MB | 12 GB + "public"."release_rev_abstract" | 3150 MB | 3988 MB | 7139 MB + "public"."creator_edit" | 935 MB | 1528 MB | 2463 MB + "public"."creator_rev" | 928 MB | 1242 MB | 2170 MB + "public"."editgroup" | 1145 MB | 647 MB | 1792 MB + "public"."creator_ident" | 631 MB | 1120 MB | 1751 MB + "public"."changelog" | 352 MB | 302 MB | 654 MB + "public"."release_rev_extid" | 221 MB | 339 MB | 561 MB + "public"."container_rev" | 164 MB | 41 MB | 205 MB + "public"."container_edit" | 46 MB | 56 MB | 102 MB + "public"."container_ident" | 12 MB | 25 MB | 36 MB + "public"."auth_oidc" | 48 kB | 48 kB | 96 kB + "public"."editgroup_annotation" | 48 kB | 48 kB | 96 kB + "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB + "public"."editor" | 48 kB | 48 kB | 96 kB + "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB + "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) diff --git a/extra/stats/2020-12-23-prod-stats.json b/extra/stats/2020-12-23-prod-stats.json new file mode 100644 index 00000000..bb27d708 --- /dev/null +++ b/extra/stats/2020-12-23-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":5073319,"timestamp":"2020-12-23T19:01:04.942860+00:00"}},"container":{"total":171139},"papers":{"in_kbart":72019264,"in_web":27596752,"in_web_not_kbart":12138388,"is_oa":18312974,"total":114347662},"release":{"refs_total":1020596562,"total":158051479}} diff --git a/extra/stats/2020-12-23-prod-table-sizes.txt b/extra/stats/2020-12-23-prod-table-sizes.txt new file mode 100644 index 00000000..dacdb48c --- /dev/null +++ b/extra/stats/2020-12-23-prod-table-sizes.txt @@ -0,0 +1,46 @@ +Size: 684.08G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 66 GB | 52 GB | 119 GB + "public"."release_rev" | 67 GB | 36 GB | 103 GB + "public"."refs_blob" | 94 GB | 2885 MB | 97 GB + "public"."file_rev" | 31 GB | 38 GB | 69 GB + "public"."release_edit" | 16 GB | 22 GB | 37 GB + "public"."work_edit" | 15 GB | 21 GB | 36 GB + "public"."file_edit" | 15 GB | 20 GB | 34 GB + "public"."file_rev_url" | 23 GB | 8456 MB | 32 GB + "public"."release_ident" | 10 GB | 15 GB | 26 GB + "public"."work_ident" | 10 GB | 15 GB | 25 GB + "public"."abstracts" | 23 GB | 1869 MB | 25 GB + "public"."file_rev_release" | 7769 MB | 13 GB | 21 GB + "public"."file_ident" | 7159 MB | 13 GB | 20 GB + "public"."release_ref" | 5277 MB | 7481 MB | 12 GB + "public"."work_rev" | 6724 MB | 5825 MB | 12 GB + "public"."release_rev_abstract" | 3306 MB | 4222 MB | 7529 MB + "public"."creator_edit" | 935 MB | 1528 MB | 2463 MB + "public"."creator_rev" | 928 MB | 1242 MB | 2170 MB + "public"."editgroup" | 1162 MB | 657 MB | 1820 MB + "public"."creator_ident" | 631 MB | 1120 MB | 1751 MB + "public"."release_rev_extid" | 314 MB | 469 MB | 783 MB + "public"."changelog" | 360 MB | 309 MB | 669 MB + "public"."container_rev" | 164 MB | 42 MB | 206 MB + "public"."container_edit" | 46 MB | 56 MB | 103 MB + "public"."container_ident" | 12 MB | 25 MB | 36 MB + "public"."webcapture_rev_cdx" | 2616 kB | 568 kB | 3184 kB + "public"."webcapture_edit" | 240 kB | 192 kB | 432 kB + "public"."webcapture_rev_url" | 192 kB | 96 kB | 288 kB + "public"."webcapture_rev_release" | 80 kB | 136 kB | 216 kB + "public"."webcapture_ident" | 88 kB | 112 kB | 200 kB + "public"."webcapture_rev" | 144 kB | 56 kB | 200 kB + "public"."auth_oidc" | 48 kB | 48 kB | 96 kB + "public"."editgroup_annotation" | 48 kB | 48 kB | 96 kB + "public"."editor" | 48 kB | 48 kB | 96 kB + "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) diff --git a/notes/bulk_edits/2020-12-14_doaj.md b/notes/bulk_edits/2020-12-14_doaj.md index 64a80fda..5e897183 100644 --- a/notes/bulk_edits/2020-12-14_doaj.md +++ b/notes/bulk_edits/2020-12-14_doaj.md @@ -122,3 +122,18 @@ ahead with the full import; note that other ingest is happening in parallel zcat /srv/fatcat/datasets/doaj_article_data_2020-11-13_all.json.gz | shuf | pv -l | parallel -j12 --round-robin --pipe ./fatcat_import.py doaj-article --issn-map-file /srv/fatcat/datasets/ISSN-to-ISSN-L.txt - # started 2020-12-17 22:01 (Pacific) + + => 5.45M 52:38:45 [28.8 /s] + => Counter({'total': 1366458, 'exists': 1020295, 'insert': 200249, 'exists-fuzzy': 144334, 'skip': 1563, 'skip-title': 1563, 'skip-doaj-id-mismatch': 17, 'update': 0}) + +As total estimates: + +- total: 5,465,832 +- exists: 4,081,180 +- exists-fuzzy: 577,336 +- insert: 800,996 + +Ending database size: Size: 684.08G + +(note that regular imports were running during same period) + |