1 files changed, 319 insertions, 0 deletions
diff --git a/notes/bootstrap/import_timing_20180910.txt b/notes/bootstrap/import_timing_20180910.txt
new file mode 100644
index 00000000..43c76e43
--- /dev/null
+++ b/notes/bootstrap/import_timing_20180910.txt
@@ -0,0 +1,319 @@
+
+Changes since last time:
+- removed "WHERE IS NOT NULL" from lookup indexes
+- some cleanups
+- JSONB
+
+## QA Timing
+
+master branch b15fa552e288bec5bbc2b07a3e11bab9235a1e7c
+
+    time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv # --batch-size 100
+    real    0m30.249s
+    user    0m9.364s
+    sys     0m0.268s
+
+    time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - # --batch-size 100
+    => something went sort of wrong? seemed to hang, with no activity.
+    => may have sent a signal or something accidentally?
+
+    xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt --batch-size 100
+    => ran for a few seconds before restarting with increased batch size
+    => increased batch size seems to be too much for releases? stick with 50
+    => still lots of long autovacuum (ANALYZE)
+
+    67017.73user 2054.96system 25:59:38elapsed 73%CPU (0avgtext+0avgdata 424692maxresident)k
+    2085872inputs+219678920outputs (100457major+30837042minor)pagefaults 0swaps
+    # 25:59:38 elapsed
+
+    time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite # --batch-size 100
+    => taking longer to start than I remembered...
+
+    Done! Inserted 6607075
+    real    1521m37.856s
+    user    305m38.444s
+    sys     18m13.916s
+    => 25.5 hours
+
+Disk/table usage:
+
+    Size:  233.95G
+
+                              table_name                          | table_size | indexes_size | total_size 
+    --------------------------------------------------------------+------------+--------------+------------
+     "public"."release_ref"                                       | 92 GB      | 26 GB        | 118 GB
+     "public"."release_rev"                                       | 30 GB      | 14 GB        | 45 GB
+     "public"."release_contrib"                                   | 12 GB      | 13 GB        | 26 GB
+     "public"."release_edit"                                      | 4710 MB    | 3353 MB      | 8062 MB
+     "public"."work_edit"                                         | 4710 MB    | 3352 MB      | 8062 MB
+     "public"."work_ident"                                        | 3454 MB    | 4153 MB      | 7608 MB
+     "public"."release_ident"                                     | 3454 MB    | 4152 MB      | 7606 MB
+     "public"."work_rev"                                          | 2241 MB    | 2076 MB      | 4317 MB
+     "public"."file_rev_url"                                      | 1994 MB    | 778 MB       | 2772 MB
+     "public"."file_rev"                                          | 827 MB     | 1215 MB      | 2042 MB
+     "public"."file_ident"                                        | 501 MB     | 533 MB       | 1034 MB
+     "public"."file_edit"                                         | 590 MB     | 411 MB       | 1001 MB
+     "public"."creator_rev"                                       | 371 MB     | 455 MB       | 826 MB
+     "public"."abstracts"                                         | 649 MB     | 52 MB        | 701 MB
+     "public"."file_release"                                      | 251 MB     | 435 MB       | 686 MB
+     "public"."creator_edit"                                      | 347 MB     | 242 MB       | 590 MB
+     "public"."creator_ident"                                     | 255 MB     | 305 MB       | 560 MB
+     "public"."release_rev_abstract"                              | 70 MB      | 90 MB        | 160 MB
+     "public"."changelog"                                         | 52 MB      | 53 MB        | 106 MB
+     "public"."editgroup"                                         | 60 MB      | 35 MB        | 94 MB
+     "public"."container_rev"                                     | 40 MB      | 27 MB        | 67 MB
+     "public"."container_edit"                                    | 24 MB      | 16 MB        | 39 MB
+     "public"."container_ident"                                   | 17 MB      | 20 MB        | 37 MB
+
+I think this is before the JSONB schema change, so won't reflect actual/final.
+
+Perf:
+
+           relname        | too_much_seq | case |  rel_size   | seq_scan | idx_scan  
+    ----------------------+--------------+------+-------------+----------+-----------
+     file_rev_url         |          -18 | OK   |  2089836544 |        2 |        20
+     file_release         |        -4001 | OK   |   263249920 |        2 |      4003
+     container_edit       |      -132140 | OK   |    24608768 |        2 |    132142
+     release_edit         |      -132140 | OK   |  4937408512 |        2 |    132142
+     work_edit            |      -132140 | OK   |  4937351168 |        2 |    132142
+     file_edit            |      -132140 | OK   |   618545152 |        2 |    132142
+     creator_edit         |      -132140 | OK   |   363954176 |        2 |    132142
+     changelog            |      -134456 | OK   |    54919168 |        5 |    134461
+     container_rev        |     -1276670 | OK   |    41680896 |   592993 |   1869663
+     abstracts            |     -1353093 | OK   |   669425664 |        3 |   1353096
+     release_ref          |     -3429019 | OK   | 99223470080 |        3 |   3429022
+     release_contrib      |     -3429026 | OK   | 13406437376 |        3 |   3429029
+     release_rev_abstract |     -3435251 | OK   |    73359360 |        3 |   3435254
+     creator_ident        |     -5360858 | OK   |   266977280 |       11 |   5360869
+     creator_rev          |     -8071846 | OK   |   388734976 |   257781 |   8329627
+     file_ident           |    -13478348 | OK   |   524746752 |        7 |  13478355
+     file_rev             |    -30368193 | OK   |   867401728 |        4 |  30368197
+     container_ident      |    -47140767 | OK   |    18219008 |        9 |  47140776
+     release_ident        |    -61021070 | OK   |  3620864000 |       24 |  61021094
+     work_rev             |   -106056502 | OK   |  2349146112 |        1 | 106056503
+     work_ident           |   -106320780 | OK   |  3620954112 |        6 | 106320786
+     editgroup            |   -117742455 | OK   |    62398464 |        2 | 117742457
+     release_rev          |   -563415274 | OK   | 32455860224 |        8 | 563415282
+
+Thought I had resoled those scans... hrm. Pretty sure (but not 100%) that I
+cleared the counters.
+
+Dumps!
+
+    time psql fatcat_prod < ../extra/quick_dump.sql
+    real    4m5.729s
+    user    0m0.056s
+    sys     0m0.016s
+    => while other stuff running
+
+    time ./ident_table_snapshot.sh
+    => seems very dominated by tar/xz compression time; should switch back to gzip
+
+    DATABASE_URL=$DATABASE_URL time ./ident_table_snapshot.sh /tmp
+    614.80user 31.74system 13:23.51elapsed 80%CPU (0avgtext+0avgdata 10124maxresident)k
+    8311456inputs+9761200outputs (2major+3792minor)pagefaults 0swaps
+    => 13.5 minutes
+    => fatcat_idents.2018-09-13.215147.r874951.tar.gz: 4.7GB
+    => uncompressed, 8.3GB total
+
+    cat ../extra/sql_dumps/fatcat_ident_releases.tsv | time ./target/release/fatcat-export release -j8 -q | pv -l | gzip > fatcat_release_dump.json.gz
+    => can not run with more than -j8, it seems (postgres client limit)
+    => TPS: 12522 (!)
+    => 101k 0:00:34 [2.99k/s]
+    => estimate 6.5 hours
+
+    # NOTE AFTER THE FACT: had "contaner" typo in the below, so wasn't expanding containers
+    cat ../extra/sql_dumps/fatcat_ident_releases.tsv | time ./target/release/fatcat-export release --expand files,contaner -j8 -q | pv -l | gzip > fatcat_release_dump_expanded.json.gz
+    => TPS: 29605 (!)
+    => 183k 0:00:35 [ 5.8k/s] (how is this faster than the above? other disk stuff finished?)
+    => 341k 0:01:07 [3.48k/s] (ah, must have been in disk cache)
+    => 4.12M 0:16:11 (averaging 4.2k/s; estimate 4 hours total)
+    => fatcat_release_dump_expanded.json.gz: 3.4G with 6.76M lines, so expecting 35GB or so
+
+    ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
+    usr sys idl wai hiq siq| read  writ| recv  send|  in   out | int   csw 
+     14   3  73   8   0   2|  43M   21M| 143k  326B|   0     0 |  64k  113k
+     18   4  72   5   0   2|  55M 4096k| 246k  326B|   0     0 |  84k  157k
+     17   4  72   6   0   2|  53M 9964k| 428k  908B|   0     0 |  84k  155k
+     14   3  73   8   0   1|  46M   20M| 143k   87k|   0     0 |  71k  132k
+     18   3  72   5   0   2|  52M 8880k| 318k   95k|   0     0 |  82k  149k
+
+    => fatcat-export using about 200% CPU in a single thread; other 8 threads at about 25% each
+        => might be interesting to profile... too many copies? switch to tsv library?
+    => seems like postgres only about 50% utilized
+
+Benchmarks!
+
+    # stub entity
+    $ wrk -t8 -c100 -d30s http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai
+    Running 30s test @ http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai
+      8 threads and 100 connections
+      Thread Stats   Avg      Stdev     Max   +/- Stdev
+        Latency     7.44ms    5.26ms  67.53ms   60.07%
+        Req/Sec     1.71k   113.55     2.07k    68.79%
+      408986 requests in 30.01s, 117.01MB read
+    Requests/sec:  13627.62
+    Transfer/sec:      3.90MB
+    => TPS: 39958
+    => lots of "BEGIN" and "SELECT 1", implying new connections? hrm.
+    => should run longer for more stable results
+
+    wrk -t12 -c400 -d2m --latency http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai
+
+    Running 2m test @ http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai
+      12 threads and 400 connections
+      Thread Stats   Avg      Stdev     Max   +/- Stdev
+        Latency    25.75ms   17.02ms 263.74ms   65.11%
+        Req/Sec     1.05k   377.38     1.82k    74.81%
+      Latency Distribution
+         50%   25.40ms
+         75%   26.56ms
+         90%   49.57ms
+         99%   76.47ms
+      1128253 requests in 2.00m, 322.80MB read
+      Socket errors: connect 0, read 10, write 0, timeout 0
+    Requests/sec:   9396.03
+    Transfer/sec:      2.69MB
+
+    => single 470% fatcatd thread; diesel?
+    => 50% systemd-journald
+
+    # rich entity
+    $ wrk -t8 -c100 -d30s https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container
+    Running 30s test @ https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container
+      8 threads and 100 connections
+      Thread Stats   Avg      Stdev     Max   +/- Stdev
+        Latency    33.35ms   14.01ms 152.05ms   79.98%
+        Req/Sec   363.14     30.36   464.00     76.48%
+      86719 requests in 30.04s, 234.46MB read
+    Requests/sec:   2886.55
+    Transfer/sec:      7.80MB
+
+    wrk -t12 -c400 -d2m --latency https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container
+
+    Running 2m test @ https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container
+      12 threads and 400 connections
+      Thread Stats   Avg      Stdev     Max   +/- Stdev
+        Latency   149.19ms  125.30ms   1.93s    95.06%
+        Req/Sec   237.89     34.60   650.00     73.54%
+      Latency Distribution
+         50%  129.86ms
+         75%  134.14ms
+         90%  210.42ms
+         99%  904.26ms
+      340625 requests in 2.00m, 0.90GB read
+      Socket errors: connect 0, read 0, write 0, timeout 303
+    Requests/sec:   2836.17
+    Transfer/sec:      7.67MB
+
+    => TPS: 9090
+    ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
+     usr sys idl wai hiq siq| read  writ| recv  send|  in   out | int   csw 
+      4   0  93   3   0   0|2401k   13M|   0     0 |2317B 4812B|4274  3824 
+     22   6  68   0   0   3|4096B    0 |  57k  956B|4096B    0 | 135k  255k
+     21   7  68   0   0   3|4096B    0 |  61k   66B|4096B    0 | 137k  259k
+     21   7  68   0   0   3|  16k 8192B| 179k  636B|  16k    0 | 134k  252k
+     21   7  68   0   0   3|   0     0 |  53k   66B|   0     0 | 135k  255k
+     22   6  68   0   0   3|   0    17M|  51k  326B|   0     0 | 137k  261k
+     22   6  68   0   0   3|   0     0 |  82k  903B|   0     0 | 136k  257k
+
+    # static (404)
+    wrk -t12 -c400 -d2m --latency http://localhost:9411/v0/asdf
+    => full 2600+% CPU in a single fatcat process (?)
+    => lots of load in wrk also
+    => system at a whole basically 100% CPU on all cores, though mostly kernel
+
+    ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system--
+    usr sys idl wai hiq siq| read  writ| recv  send|  in   out | int   csw 
+      4   0  93   3   0   0|2399k   13M|   0     0 |2315B 4808B|4359  3976 
+     26  66   2   0   0   6|   0     0 |4191k  956B|   0     0 | 465k  536k
+     26  67   1   0   0   6|   0     0 | 511k   66B|   0     0 | 466k  543k
+     27  64   3   0   0   6|   0     0 |2648k  318B|   0     0 | 481k  555k
+     25  67   2   0   0   6|   0     0 |1909k  768B|   0     0 | 481k  550k
+
+    Running 2m test @ http://localhost:9411/v0/asdf
+      12 threads and 400 connections
+      Thread Stats   Avg      Stdev     Max   +/- Stdev
+        Latency     1.40ms    1.65ms 219.48ms   93.33%
+        Req/Sec    17.71k     8.97k   65.69k    63.84%
+      Latency Distribution
+         50%    1.13ms
+         75%    1.89ms
+         90%    2.71ms
+         99%    4.76ms
+      21145472 requests in 2.00m, 1.61GB read
+      Socket errors: connect 0, read 12, write 0, timeout 0
+      Non-2xx or 3xx responses: 21145472
+    Requests/sec: 176071.79
+    Transfer/sec:     13.77MB
+
+Noting pretty non-trival traffic to 207.241.229.230 and
+es-worker-02.us.archive.org (207.241.229.246)... probably because this is a
+datanode?
+
+## Misc
+
+Postgres really wanted to log: <https://wiki.postgresql.org/wiki/Per-user_log_settings>
+
+    ALTER DATABASE fatcat_prod SET log_statement = 'ddl';
+    ALTER USER fatcat SET log_statement = 'ddl';
+
+
+## Later Imports
+
+    zcat /srv/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched -
+    => only ~40 TPS
+    => looks like almost all selects... probably skipping? huge postgres CPU churn, huh. missing index?
+
+           relname        | too_much_seq |      case      |  rel_size   | seq_scan | idx_scan 
+    ----------------------+--------------+----------------+-------------+----------+----------
+     file_rev             |     -5343173 | OK             |   867483648 |      693 |  5343866
+
+    EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '7d97e98f8af710c7e7fe703abc8f639e0ee507c4' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1;
+
+Ugh, this whole cycle again. Query only takes 1ms for a no-hit, or 5ms for success.
+
+    http get http://localhost:9411/v0/file/lookup?sha1=d77cb2a8b207507e0df27ba1393e8118eec9217f => not found
+    http get http://localhost:9411/v0/file/lookup?sha1=d6c3f6ca785ee63293da02615c8495c5cd25fa25 => found
+        => but no releases?
+
+SET auto_explain.log_min_duration = 0;
+SET auto_explain.log_analyze = true;
+ALTER USER fatcat SET log_statement = 'all';
+ALTER USER fatcat SET auto_explain.log_min_duration = 0;
+ALTER USER fatcat SET auto_explain.log_analyze = true;;
+sudo service fatcat-api restart
+
+
+2018-09-16 00:42:29.883 UTC [13868] fatcat@fatcat_prod LOG:  duration: 3028.143 ms  plan:
+        Query Text: SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = $1 AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = $2 AND "file_ident"."redirect_id" IS NULL LIMIT $3
+        Limit  (cost=0.43..8.56 rows=1 width=454) (actual time=3028.139..3028.139 rows=0 loops=1)
+          ->  Nested Loop  (cost=0.43..268756.48 rows=33056 width=454) (actual time=3028.138..3028.138 rows=0 loops=1)
+                ->  Seq Scan on file_rev  (cost=0.00..205023.16 rows=33042 width=405) (actual time=3028.137..3028.137 rows=0 loops=1)
+                      Filter: ((sha1 IS NOT NULL) AND ((sha1)::text = 'd68d4e2cc420647109cdfe410efed86cd64465e5'::text))
+                      Rows Removed by Filter: 6607786
+                ->  Index Scan using file_ident_rev_idx on file_ident  (cost=0.43..1.92 rows=1 width=49) (never executed)
+                      Index Cond: (rev_id = file_rev.id)
+                      Filter: (is_live AND (redirect_id IS NULL))
+
+
+fatcat_prod=# EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '3f242a192acc258bdfdb15194341943222222222222440c313' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1;
+                                                        QUERY PLAN                                                        
+--------------------------------------------------------------------------------------------------------------------------
+ Limit  (cost=0.99..5.04 rows=1 width=454)
+   ->  Nested Loop  (cost=0.99..5.04 rows=1 width=454)
+         ->  Index Scan using file_rev_sha1_idx on file_rev  (cost=0.56..2.58 rows=1 width=405)
+               Index Cond: ((sha1 IS NOT NULL) AND (sha1 = '3f242a192acc258bdfdb15194341943222222222222440c313'::bpchar))
+         ->  Index Scan using file_ident_rev_idx on file_ident  (cost=0.43..2.45 rows=1 width=49)
+               Index Cond: (rev_id = file_rev.id)
+               Filter: (is_live AND (redirect_id IS NULL))
+
+After all this debugging, asked on IRC and learned that I really shouldn't ever
+use CHAR() (or even VARCHAR()). Will update the schema.
+
+## Investigating Missing Rows
+
+    fatcat_prod=# select count(*) from release_ident; => 53028167
+
+Thought I had more than that previously? Table sizes were also bigger. Hrm.
+