diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-24 18:04:54 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-24 18:04:54 -0700 |
commit | ed78736b5d96a294c89b201c1d992bd30d809434 (patch) | |
tree | b860457b9b1d09b8ecf52e512dddfcfd3e3e60ed | |
parent | 94831f130e87286ad7394ec10f8e61d7ecd72adf (diff) | |
download | fatcat-ed78736b5d96a294c89b201c1d992bd30d809434.tar.gz fatcat-ed78736b5d96a294c89b201c1d992bd30d809434.zip |
updated import timing
-rw-r--r-- | notes/import_timing_20180920.txt | 30 | ||||
-rw-r--r-- | notes/import_timing_20180923.txt | 39 |
2 files changed, 68 insertions, 1 deletion
diff --git a/notes/import_timing_20180920.txt b/notes/import_timing_20180920.txt index 3bcf2a57..a57ffd77 100644 --- a/notes/import_timing_20180920.txt +++ b/notes/import_timing_20180920.txt @@ -61,8 +61,36 @@ Table sizes at this point: "public"."container_edit" | 32 MB | 30 MB | 62 MB "public"."container_ident" | 24 MB | 28 MB | 52 MB + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan +----------------------+--------------+------+--------------+----------+---------- + release_edit | 0 | OK | 6993084416 | 0 | 0 + container_rev | 0 | OK | 54124544 | 0 | 0 + creator_ident | 0 | OK | 266919936 | 0 | 0 + creator_edit | 0 | OK | 363921408 | 0 | 0 + work_rev | 0 | OK | 3327262720 | 0 | 0 + creator_rev | 0 | OK | 388726784 | 0 | 0 + work_ident | 0 | OK | 5128560640 | 0 | 0 + work_edit | 0 | OK | 6993108992 | 0 | 0 + container_ident | 0 | OK | 25092096 | 0 | 0 + file_edit | 0 | OK | 598278144 | 0 | 0 + container_edit | 0 | OK | 33857536 | 0 | 0 + changelog | 0 | OK | 127549440 | 0 | 0 + abstracts | -4714 | OK | 1706713088 | 0 | 4714 + file_release | -13583 | OK | 401752064 | 0 | 13583 + file_rev_url | -13583 | OK | 3832389632 | 0 | 13583 + editgroup | -74109 | OK | 144277504 | 0 | 74109 + release_contrib | -76699 | OK | 20357849088 | 0 | 76699 + release_ref | -76700 | OK | 183009157120 | 3 | 76703 + release_rev_abstract | -76939 | OK | 192102400 | 0 | 76939 + release_rev | -77965 | OK | 47602647040 | 0 | 77965 + file_ident | -100089 | OK | 438255616 | 3 | 100092 + release_ident | -152809 | OK | 5128617984 | 0 | 152809 + file_rev | -440780 | OK | 837705728 | 0 | 440780 +(23 rows) + Continuing imports: zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - - => HTTP response body: {"message":"duplicate key value violates unique constraint \"file_edit_editgroup_id_ident_id_key\""} + diff --git a/notes/import_timing_20180923.txt 
b/notes/import_timing_20180923.txt new file mode 100644 index 00000000..f8814f3d --- /dev/null +++ b/notes/import_timing_20180923.txt @@ -0,0 +1,39 @@ + + 105595.18user 3903.65system 15:59:39elapsed 190%CPU (0avgtext+0avgdata 458836maxresident)k + 71022792inputs+327828472outputs (176major+31149593minor)pagefaults 0swaps + + real 959m39.521s + user 1845m10.392s + sys 70m33.780s + +Did I get the same error again? I'm confused: + + HTTP response body: {"message":"number of parameters must be between 0 and 65535\n"} + (but not in all threads) + +Yes, ugh, because 50*2500 can be over (it's not just individual large releases, +they come in big batches). + +But: + + select count(id) from release_ident; => 70006121 + +A lot, though not 72 million like last time, hrm. I'm... going to move ahead I +guess. + +"Processed 4440850 lines, inserted 3509600, updated 0." + => implies 79029915 records + + time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched --no-file-update - + Processed 530750 lines, inserted 435239, updated 0. (etc) + Command exited with non-zero status 1 + 15121.47user 676.49system 2:23:52elapsed 183%CPU (0avgtext+0avgdata 70076maxresident)k + 127760inputs+3477184outputs (116major+475489minor)pagefaults 0swaps + + real 143m52.681s + user 252m31.620s + sys 11m21.608s + + zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - + + (running...) |