diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-25 15:44:39 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2018-09-25 15:44:39 -0700 | 
| commit | 186c03c2b9e1b08e4298383fc6593d1b2c3a5dd8 (patch) | |
| tree | b116678710d9a69df023f7dc90525d7563506891 | |
| parent | c1391060cc4575365f81fc674e35d8c182ccde83 (diff) | |
| download | fatcat-186c03c2b9e1b08e4298383fc6593d1b2c3a5dd8.tar.gz fatcat-186c03c2b9e1b08e4298383fc6593d1b2c3a5dd8.zip | |
start organizing notes into subdirectories
| -rw-r--r-- | notes/bootstrap/import_timing_20180815.txt (renamed from notes/import_timing_20180815.txt) | 0 | ||||
| -rw-r--r-- | notes/bootstrap/import_timing_20180908.txt (renamed from notes/import_timing_20180908.txt) | 0 | ||||
| -rw-r--r-- | notes/bootstrap/import_timing_20180910.txt (renamed from notes/import_timing_20180910.txt) | 0 | ||||
| -rw-r--r-- | notes/bootstrap/import_timing_20180920.txt (renamed from notes/import_timing_20180920.txt) | 0 | ||||
| -rw-r--r-- | notes/bootstrap/import_timing_20180923.txt | 91 | ||||
| -rw-r--r-- | notes/bootstrap/initial_sources.txt (renamed from notes/initial_sources.txt) | 0 | ||||
| -rw-r--r-- | notes/bootstrap/old_imports.txt (renamed from notes/old_imports.txt) | 0 | ||||
| -rw-r--r-- | notes/import_timing_20180923.txt | 39 | ||||
| -rw-r--r-- | notes/performance/postgres_performance.txt (renamed from notes/postgres_performance.txt) | 0 | ||||
| -rw-r--r-- | notes/performance/speed.txt (renamed from notes/speed.txt) | 0 | ||||
| -rw-r--r-- | notes/schema/alignments.csv (renamed from notes/alignments.csv) | 0 | ||||
| -rw-r--r-- | notes/schema/alignments.txt (renamed from notes/alignments.txt) | 0 | ||||
| -rw-r--r-- | notes/schema/contrib_types.txt (renamed from notes/contrib_types.txt) | 0 | ||||
| -rw-r--r-- | notes/schema/work_release_types.txt (renamed from notes/work_release_types.txt) | 0 | ||||
14 files changed, 91 insertions, 39 deletions
| diff --git a/notes/import_timing_20180815.txt b/notes/bootstrap/import_timing_20180815.txt index 1206cc41..1206cc41 100644 --- a/notes/import_timing_20180815.txt +++ b/notes/bootstrap/import_timing_20180815.txt diff --git a/notes/import_timing_20180908.txt b/notes/bootstrap/import_timing_20180908.txt index 3091e4fa..3091e4fa 100644 --- a/notes/import_timing_20180908.txt +++ b/notes/bootstrap/import_timing_20180908.txt diff --git a/notes/import_timing_20180910.txt b/notes/bootstrap/import_timing_20180910.txt index 43c76e43..43c76e43 100644 --- a/notes/import_timing_20180910.txt +++ b/notes/bootstrap/import_timing_20180910.txt diff --git a/notes/import_timing_20180920.txt b/notes/bootstrap/import_timing_20180920.txt index a57ffd77..a57ffd77 100644 --- a/notes/import_timing_20180920.txt +++ b/notes/bootstrap/import_timing_20180920.txt diff --git a/notes/bootstrap/import_timing_20180923.txt b/notes/bootstrap/import_timing_20180923.txt new file mode 100644 index 00000000..c7161842 --- /dev/null +++ b/notes/bootstrap/import_timing_20180923.txt @@ -0,0 +1,91 @@ + +    105595.18user 3903.65system 15:59:39elapsed 190%CPU (0avgtext+0avgdata 458836maxresident)k +    71022792inputs+327828472outputs (176major+31149593minor)pagefaults 0swaps + +    real    959m39.521s +    user    1845m10.392s +    sys     70m33.780s + +Did I get the same error again? I'm confused: + +    HTTP response body: {"message":"number of parameters must be between 0 and 65535\n"} +    (but not in all threads) + +Yes, ugh, because 50*2500 can be over (it's not just individual large releases, +they come in big batches). + +But: + +    select count(id) from release_ident; => 70006121 + +A lot, though not 72 million like last time, hrm. I'm... going to move ahead I +guess. + +"Processed 4440850 lines, inserted 3509600, updated 0." 
+    => implies 79029915 records + +    time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched --no-file-update - +    Processed 530750 lines, inserted 435239, updated 0. (etc) +    Command exited with non-zero status 1 +    15121.47user 676.49system 2:23:52elapsed 183%CPU (0avgtext+0avgdata 70076maxresident)k +    127760inputs+3477184outputs (116major+475489minor)pagefaults 0swaps + +    real    143m52.681s +    user    252m31.620s +    sys     11m21.608s + +    zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - + +    Processed 485200 lines, inserted 244101, updated 168344. (etc) +    22671.44user 1069.84system 3:27:47elapsed 190%CPU (0avgtext+0avgdata 39348maxresident)k +    99672inputs+2497848outputs (109major+422150minor)pagefaults 0swaps + +fatcat-export dump: + +     INFO 2018-09-25T10:01:06Z: fatcat_export: Done reading (70006121 lines), waiting for workers to exit... +      197GiB 4:56:17 [11.4MiB/s] [                                   <=>                                                     ] + +How big is everything? 
+ +    select count(*) from file_release; => 10,485,964 +    select count (distinct target_release_ident_id) from file_release; => 6,486,934 +    select count(id) from release_ident; => 70,006,121 +    select count(*) from container_ident; => 354,793 +    select count(*) from creator_ident; => 3,906,990 +    Size:  324.24G +    /dev/vda1       1.8T  511G  1.2T  31% / + +                          table_name                          | table_size | indexes_size | total_size  +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref"                                       | 121 GB     | 42 GB        | 163 GB + "public"."release_rev"                                       | 33 GB      | 19 GB        | 52 GB + "public"."release_contrib"                                   | 21 GB      | 18 GB        | 39 GB + "public"."release_edit"                                      | 6218 MB    | 6084 MB      | 12 GB + "public"."work_edit"                                         | 6218 MB    | 6084 MB      | 12 GB + "public"."release_ident"                                     | 4560 MB    | 5470 MB      | 10030 MB + "public"."work_ident"                                        | 4560 MB    | 5466 MB      | 10027 MB + "public"."file_rev_url"                                      | 5543 MB    | 2112 MB      | 7655 MB + "public"."work_rev"                                          | 2958 MB    | 2733 MB      | 5691 MB + "public"."file_rev"                                          | 1201 MB    | 1811 MB      | 3012 MB + "public"."abstracts"                                         | 2294 MB    | 184 MB       | 2478 MB + "public"."file_edit"                                         | 931 MB     | 864 MB       | 1795 MB + "public"."file_release"                                      | 605 MB     | 1058 MB      | 1663 MB + "public"."file_ident"                                        | 529 MB     | 633 MB       | 1162 MB + 
"public"."creator_rev"                                       | 371 MB     | 456 MB       | 826 MB + "public"."creator_edit"                                      | 347 MB     | 352 MB       | 699 MB + "public"."release_rev_abstract"                              | 250 MB     | 325 MB       | 575 MB + "public"."creator_ident"                                     | 255 MB     | 304 MB       | 559 MB + "public"."changelog"                                         | 122 MB     | 127 MB       | 250 MB + "public"."editgroup"                                         | 138 MB     | 82 MB        | 220 MB + "public"."container_rev"                                     | 52 MB      | 38 MB        | 89 MB + "public"."container_edit"                                    | 32 MB      | 30 MB        | 62 MB + "public"."container_ident"                                   | 24 MB      | 28 MB        | 52 MB + +Hrm, bunch of not-accepted containers: + +    select count(*) from container_ident where is_live='f'; => 301507 +    select count(*) from release_ident where is_live='f'; => 0 +    select count(*) from work_ident where is_live='f'; => 0 +    select count(*) from creator_ident where is_live='f'; => 1 (there was a hang earlier) +    select count(*) from file_ident where is_live='f'; => 0 + diff --git a/notes/initial_sources.txt b/notes/bootstrap/initial_sources.txt index cc22019d..cc22019d 100644 --- a/notes/initial_sources.txt +++ b/notes/bootstrap/initial_sources.txt diff --git a/notes/old_imports.txt b/notes/bootstrap/old_imports.txt index 1233d4a8..1233d4a8 100644 --- a/notes/old_imports.txt +++ b/notes/bootstrap/old_imports.txt diff --git a/notes/import_timing_20180923.txt b/notes/import_timing_20180923.txt deleted file mode 100644 index f8814f3d..00000000 --- a/notes/import_timing_20180923.txt +++ /dev/null @@ -1,39 +0,0 @@ - -    105595.18user 3903.65system 15:59:39elapsed 190%CPU (0avgtext+0avgdata 458836maxresident)k -    71022792inputs+327828472outputs 
(176major+31149593minor)pagefaults 0swaps - -    real    959m39.521s -    user    1845m10.392s -    sys     70m33.780s - -Did I get the same error again? I'm confused: - -    HTTP response body: {"message":"number of parameters must be between 0 and 65535\n"} -    (but not in all threads) - -Yes, ugh, because 50*2500 can be over (it's not just individual large releases, -they come in big batches). - -But: - -    select count(id) from release_ident; => 70006121 - -A lot, though not 72 million like last time, hrm. I'm... going to move ahead I -guess. - -"Processed 4440850 lines, inserted 3509600, updated 0." -    => implies 79029915 records - -    time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched --no-file-update - -    Processed 530750 lines, inserted 435239, updated 0. (etc) -    Command exited with non-zero status 1 -    15121.47user 676.49system 2:23:52elapsed 183%CPU (0avgtext+0avgdata 70076maxresident)k -    127760inputs+3477184outputs (116major+475489minor)pagefaults 0swaps - -    real    143m52.681s -    user    252m31.620s -    sys     11m21.608s - -    zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - - -    (running...) 
diff --git a/notes/postgres_performance.txt b/notes/performance/postgres_performance.txt index cd2a5162..cd2a5162 100644 --- a/notes/postgres_performance.txt +++ b/notes/performance/postgres_performance.txt diff --git a/notes/speed.txt b/notes/performance/speed.txt index f885aea7..f885aea7 100644 --- a/notes/speed.txt +++ b/notes/performance/speed.txt diff --git a/notes/alignments.csv b/notes/schema/alignments.csv index b8619ddc..b8619ddc 100644 --- a/notes/alignments.csv +++ b/notes/schema/alignments.csv diff --git a/notes/alignments.txt b/notes/schema/alignments.txt index e2736268..e2736268 100644 --- a/notes/alignments.txt +++ b/notes/schema/alignments.txt diff --git a/notes/contrib_types.txt b/notes/schema/contrib_types.txt index 01024b40..01024b40 100644 --- a/notes/contrib_types.txt +++ b/notes/schema/contrib_types.txt diff --git a/notes/work_release_types.txt b/notes/schema/work_release_types.txt index 6eff118b..6eff118b 100644 --- a/notes/work_release_types.txt +++ b/notes/schema/work_release_types.txt | 
