diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-24 18:04:54 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-24 18:04:54 -0700 | 
| commit | ed78736b5d96a294c89b201c1d992bd30d809434 (patch) | |
| tree | b860457b9b1d09b8ecf52e512dddfcfd3e3e60ed | |
| parent | 94831f130e87286ad7394ec10f8e61d7ecd72adf (diff) | |
| download | fatcat-ed78736b5d96a294c89b201c1d992bd30d809434.tar.gz fatcat-ed78736b5d96a294c89b201c1d992bd30d809434.zip | |
updated import timing
| -rw-r--r-- | notes/import_timing_20180920.txt | 30 | ||||
| -rw-r--r-- | notes/import_timing_20180923.txt | 39 | 
2 files changed, 68 insertions, 1 deletion
| diff --git a/notes/import_timing_20180920.txt b/notes/import_timing_20180920.txt index 3bcf2a57..a57ffd77 100644 --- a/notes/import_timing_20180920.txt +++ b/notes/import_timing_20180920.txt @@ -61,8 +61,36 @@ Table sizes at this point:   "public"."container_edit"                                    | 32 MB      | 30 MB        | 62 MB   "public"."container_ident"                                   | 24 MB      | 28 MB        | 52 MB + +       relname        | too_much_seq | case |   rel_size   | seq_scan | idx_scan  +----------------------+--------------+------+--------------+----------+---------- + release_edit         |            0 | OK   |   6993084416 |        0 |        0 + container_rev        |            0 | OK   |     54124544 |        0 |        0 + creator_ident        |            0 | OK   |    266919936 |        0 |        0 + creator_edit         |            0 | OK   |    363921408 |        0 |        0 + work_rev             |            0 | OK   |   3327262720 |        0 |        0 + creator_rev          |            0 | OK   |    388726784 |        0 |        0 + work_ident           |            0 | OK   |   5128560640 |        0 |        0 + work_edit            |            0 | OK   |   6993108992 |        0 |        0 + container_ident      |            0 | OK   |     25092096 |        0 |        0 + file_edit            |            0 | OK   |    598278144 |        0 |        0 + container_edit       |            0 | OK   |     33857536 |        0 |        0 + changelog            |            0 | OK   |    127549440 |        0 |        0 + abstracts            |        -4714 | OK   |   1706713088 |        0 |     4714 + file_release         |       -13583 | OK   |    401752064 |        0 |    13583 + file_rev_url         |       -13583 | OK   |   3832389632 |        0 |    13583 + editgroup            |       -74109 | OK   |    144277504 |        0 |    74109 + release_contrib      |       -76699 | OK   |  20357849088 |        0 |    76699 
+ release_ref          |       -76700 | OK   | 183009157120 |        3 |    76703 + release_rev_abstract |       -76939 | OK   |    192102400 |        0 |    76939 + release_rev          |       -77965 | OK   |  47602647040 |        0 |    77965 + file_ident           |      -100089 | OK   |    438255616 |        3 |   100092 + release_ident        |      -152809 | OK   |   5128617984 |        0 |   152809 + file_rev             |      -440780 | OK   |    837705728 |        0 |   440780 +(23 rows) +  Continuing imports:      zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - -      => HTTP response body: {"message":"duplicate key value violates unique constraint \"file_edit_editgroup_id_ident_id_key\""} + diff --git a/notes/import_timing_20180923.txt b/notes/import_timing_20180923.txt new file mode 100644 index 00000000..f8814f3d --- /dev/null +++ b/notes/import_timing_20180923.txt @@ -0,0 +1,39 @@ + +    105595.18user 3903.65system 15:59:39elapsed 190%CPU (0avgtext+0avgdata 458836maxresident)k +    71022792inputs+327828472outputs (176major+31149593minor)pagefaults 0swaps + +    real    959m39.521s +    user    1845m10.392s +    sys     70m33.780s + +Did I get the same error again? I'm confused: + +    HTTP response body: {"message":"number of parameters must be between 0 and 65535\n"} +    (but not in all threads) + +Yes, ugh, because 50*2500 can be over (it's not just individual large releases, +they come in big batches). + +But: + +    select count(id) from release_ident; => 70006121 + +A lot, though not 72 million like last time, hrm. I'm... going to move ahead I +guess. + +"Processed 4440850 lines, inserted 3509600, updated 0." 
+    => implies 79029915 records + +    time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched --no-file-update - +    Processed 530750 lines, inserted 435239, updated 0. (etc) +    Command exited with non-zero status 1 +    15121.47user 676.49system 2:23:52elapsed 183%CPU (0avgtext+0avgdata 70076maxresident)k +    127760inputs+3477184outputs (116major+475489minor)pagefaults 0swaps + +    real    143m52.681s +    user    252m31.620s +    sys     11m21.608s + +    zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - + +    (running...) | 
