| author    | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-22 19:08:18 -0700 |
|-----------|----------------------------------------|---------------------------|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-22 19:08:18 -0700 |
| commit    | 91eb3a7a9e7fdb1b344462d5bfb3e826320dc431 (patch) | |
| tree      | aa56b19199df44e91eb4193711a9d39d5ef7dc73 /notes  | |
| parent    | b12158b396bd849f40ff6713ad7836f3293f4029 (diff)  | |
| download  | fatcat-91eb3a7a9e7fdb1b344462d5bfb3e826320dc431.tar.gz, fatcat-91eb3a7a9e7fdb1b344462d5bfb3e826320dc431.zip | |
commit old notes and other files
Diffstat (limited to 'notes')
| -rw-r--r-- | notes/import_timing_20180910.txt |  1 |
| -rw-r--r-- | notes/import_timing_20180920.txt | 68 |
| -rw-r--r-- | notes/old_imports.txt            | 20 |
3 files changed, 89 insertions, 0 deletions
diff --git a/notes/import_timing_20180910.txt b/notes/import_timing_20180910.txt
index c9f18548..43c76e43 100644
--- a/notes/import_timing_20180910.txt
+++ b/notes/import_timing_20180910.txt
@@ -122,6 +122,7 @@ Dumps!
     => 101k 0:00:34 [2.99k/s]
     => estimate 6.5 hours
 
+    # NOTE AFTER THE FACT: had "contaner" typo in the below, so wasn't expanding containers
     cat ../extra/sql_dumps/fatcat_ident_releases.tsv | time ./target/release/fatcat-export release --expand files,contaner -j8 -q | pv -l | gzip > fatcat_release_dump_expanded.json.gz
     => TPS: 29605 (!)
     => 183k 0:00:35 [ 5.8k/s] (how is this faster than the above? other disk stuff finished?)
diff --git a/notes/import_timing_20180920.txt b/notes/import_timing_20180920.txt
new file mode 100644
index 00000000..3bcf2a57
--- /dev/null
+++ b/notes/import_timing_20180920.txt
@@ -0,0 +1,68 @@
+
+This was on fatcat-prod-vm (2TB disk).
+
+    time ./fatcat_import.py import-issn /srv/fatcat/datasets/journal_extra_metadata.csv
+
+    Processed 53300 lines, inserted 53283, updated 0.
+    real    0m32.463s
+    user    0m8.716s
+    sys     0m0.284s
+
+    time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid -
+
+    Processed 48900 lines, inserted 48731, updated 0. <= these numbers times 80x
+    100% 80:0=0s
+
+    real    10m20.598s
+    user    26m16.544s
+    sys     1m40.284s
+
+    time xzcat /srv/fatcat/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3
+
+    Processed 4679900 lines, inserted 3755867, updated 0.
+    107730.08user 4110.22system 16:31:25elapsed 188%CPU (0avgtext+0avgdata 447496maxresident)k
+    77644160inputs+361948352outputs (105major+49094767minor)pagefaults 0swaps
+
+    => 16.5 hours, faster!
+
+    select count(id) from release_ident; => 75106713
+
+    kernel/system crashed after first file import (!), so don't have numbers from that.
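+
+    # (sketch, added for reference: the same count pulled for every entity
+    # table in one loop; psql flags are standard, but the "fatcat" database
+    # name is an assumption)
+    for t in container_ident creator_ident file_ident release_ident work_ident; do
+        echo -n "$t: "; psql fatcat -tAc "SELECT count(id) FROM $t;"
+    done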
+
+Table sizes at this point:
+
+    select count(id) from file_ident; => 6334606
+
+    Size:  389.25G
+
+                          table_name                          | table_size | indexes_size | total_size 
+--------------------------------------------------------------+------------+--------------+------------
+ "public"."release_ref"                                       | 170 GB     | 47 GB        | 217 GB
+ "public"."release_rev"                                       | 44 GB      | 21 GB        | 65 GB
+ "public"."release_contrib"                                   | 19 GB      | 20 GB        | 39 GB
+ "public"."release_edit"                                      | 6671 MB    | 6505 MB      | 13 GB
+ "public"."work_edit"                                         | 6671 MB    | 6505 MB      | 13 GB
+ "public"."release_ident"                                     | 4892 MB    | 5875 MB      | 11 GB
+ "public"."work_ident"                                        | 4892 MB    | 5874 MB      | 11 GB
+ "public"."work_rev"                                          | 3174 MB    | 2936 MB      | 6109 MB
+ "public"."file_rev_url"                                      | 3634 MB    | 1456 MB      | 5090 MB
+ "public"."file_rev"                                          | 792 MB     | 1281 MB      | 2073 MB
+ "public"."abstracts"                                         | 1665 MB    | 135 MB       | 1800 MB
+ "public"."file_edit"                                         | 565 MB     | 561 MB       | 1126 MB
+ "public"."file_release"                                      | 380 MB     | 666 MB       | 1045 MB
+ "public"."file_ident"                                        | 415 MB     | 496 MB       | 911 MB
+ "public"."creator_rev"                                       | 371 MB     | 457 MB       | 828 MB
+ "public"."creator_edit"                                      | 347 MB     | 353 MB       | 700 MB
+ "public"."creator_ident"                                     | 255 MB     | 305 MB       | 559 MB
+ "public"."release_rev_abstract"                              | 183 MB     | 237 MB       | 421 MB
+ "public"."changelog"                                         | 122 MB     | 126 MB       | 247 MB
+ "public"."editgroup"                                         | 138 MB     | 81 MB        | 219 MB
+ "public"."container_rev"                                     | 52 MB      | 38 MB        | 89 MB
+ "public"."container_edit"                                    | 32 MB      | 30 MB        | 62 MB
+ "public"."container_ident"                                   | 24 MB      | 28 MB        | 52 MB
+
+Continuing imports:
+
+    zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched -
+
+    => HTTP response body: {"message":"duplicate key value violates unique constraint \"file_edit_editgroup_id_ident_id_key\""}
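
The constraint name suggests a UNIQUE (editgroup_id, ident_id) pair on
file_edit, i.e. the same file ident getting edited twice within a single
editgroup, presumably from duplicate rows in the matched dump. A quick
sketch for checking the input for duplicates, assuming each JSON line has a
top-level `sha1` field:

    zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz \
        | jq -r .sha1 | sort | uniq -d | wc -l   # non-zero means duplicate files in the dump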
| rg '"person":' > /srv/datasets/public_profiles_1_2_json.all.json +    time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - + +Does not work: + +    ./fatcat_import.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json + +Instead: + +    cat /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json | jq -c . | ./fatcat_import.py import-orcid - | 
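
The single-file invocation presumably fails because the importer reads one
compact JSON object per line from stdin, while raw ORCID profile files are
pretty-printed over many lines; `jq -c .` re-serializes each profile onto a
single line. A sketch extending the same workaround to a whole partial
directory (paths as above, otherwise an untested assumption):

    find /data/orcid/partial/public_profiles_API-2.0_2017_10_json -name '*.json' \
        | xargs cat | jq -c . | ./fatcat_import.py import-orcid -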
