summaryrefslogtreecommitdiffstats
path: root/notes/import_timing_20180908.txt
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-09-10 19:33:13 -0700
committerBryan Newbold <bnewbold@robocracy.org>2018-09-10 19:33:13 -0700
commit60b070103e80a83e062a57cefd0ba0a84fc3a4c0 (patch)
treeb7dfb4c734b59044e5e76ceeb61f5147f7b2af84 /notes/import_timing_20180908.txt
parent5647fd1c8c043a1e36af39a03b6495bc75b576dc (diff)
downloadfatcat-60b070103e80a83e062a57cefd0ba0a84fc3a4c0.tar.gz
fatcat-60b070103e80a83e062a57cefd0ba0a84fc3a4c0.zip
add import timing notes from weekend
Diffstat (limited to 'notes/import_timing_20180908.txt')
-rw-r--r--notes/import_timing_20180908.txt249
1 files changed, 249 insertions, 0 deletions
diff --git a/notes/import_timing_20180908.txt b/notes/import_timing_20180908.txt
new file mode 100644
index 00000000..3091e4fa
--- /dev/null
+++ b/notes/import_timing_20180908.txt
@@ -0,0 +1,249 @@
+
+Changes since last time:
+- auto-accept flag (potentially no need to UPDATE/vacuum database)
+- large CRUD code refactor (but not complete removal of api_server CRUD yet)
+- removed "single query insert per entity", but added row-based inserts
+
+## Laptop Rough Timing
+
+HTTP-VERB branch 7354899493f6448bed5698ad6ade1dbebcf39379:
+=> note: in this branch, importer is sitll creating editgroups in separate request
+
+ time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv
+ real 0m28.779s
+ user 0m5.716s
+ sys 0m0.208s
+
+ cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt
+
+ 89.83user 4.58system 1:46.67elapsed 88%CPU (0avgtext+0avgdata 386184maxresident)k
+ 78632inputs+385408outputs (1major+579693minor)pagefaults 0swaps
+ # 1:46.67 elapsed
+
+ # running again
+ time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv
+ real 0m29.714s
+ user 0m6.048s
+ sys 0m0.212s
+
+ cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt
+ 89.07user 4.52system 1:48.11elapsed 86%CPU (0avgtext+0avgdata 385940maxresident)k
+ 0inputs+377456outputs (0major+583532minor)pagefaults 0swaps
+ # 1:48.11 elapsed
+
+MASTER branch 0053d133f8ff96aa4dedc1ff7e2754812ddfc79a
+
+ time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv
+ real 1m8.061s
+ user 0m7.384s
+ sys 0m0.280s
+
+ cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt
+ 96.68user 5.25system 3:23.05elapsed 50%CPU (0avgtext+0avgdata 385516maxresident)k
+ 0inputs+389168outputs (0major+586810minor)pagefaults 0swaps
+ # 3:23.05 elapsed
+
+AUTO-ACCEPT branch 8cccbcdef11e7ddc761ec494cb894a8d49a0d510
+
+ time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv
+ real 1m7.757s
+ user 0m6.240s
+ sys 0m0.228s
+
+ cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt
+ 91.73user 4.65system 3:26.61elapsed 46%CPU (0avgtext+0avgdata 385512maxresident)k
+ 0inputs+384976outputs (0major+580544minor)pagefaults 0swaps
+
+So, annecdotally, seems like autoaccept didn't do much here (not
+vacuum-limited?), but the row inserts were more than a factor of 2x performance
+improvement. Great! Could try experimenting with even larger batch sizes...
+
+
+## Production Import
+
+http-verb branch 7354899493f6448bed5698ad6ade1dbebcf39379
+
+ time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv
+
+ real 0m30.845s
+ user 0m8.884s
+ sys 0m0.312s
+
+
+ time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid -
+ # TPS: 1181
+ real 6m54.019s => down from 22m!
+ user 25m33.980s
+ sys 1m30.480s
+
+ xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt
+ # fatcatd at 200% CPU (2 full cores); many PIDs/workers, but only one so busy (must be diesel/db driver?)
+ # parallel at ~80%
+ # postgres pretty busy; looks like doing PARSE on ever request? some idle in transaction
+ # postgres still does a vacuum to analyze; good!
+ # ~600-1000 TPS near start (large variance)
+ # left to run overnight...
+ # slowed down to ~50-80 TPS about 10 hours later
+ # lots of IOWAIT
+ # only at 40309165 rows in the morning; this could take a long time
+
+ # PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgres - Ref.: 2s
+ # Size: 153.71G - 1.90M/s | TPS: 77
+ # Mem.: 68.70% - 22.60G/49.14G | IO Max: 20448/s
+ # Swap: 3.80% - 1.88G/50.00G | Read : 10.17M/s - 2603/s
+ # Load: 11.71 11.80 12.01 | Write: 4.00M/s - 1024/s
+
+ 92530.17user 2891.76system 35:45:15elapsed 74%CPU (0avgtext+0avgdata 463520maxresident)k
+ 1093736inputs+302588448outputs (52568major+36405225minor)pagefaults 0swaps
+ # 35:45:15 elapsed
+
+ time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite
+
+## Perf Tweaks
+
+
+
+ SELECT
+ relname,
+ seq_scan - idx_scan AS too_much_seq,
+ CASE
+ WHEN
+ seq_scan - coalesce(idx_scan, 0) > 0
+ THEN
+ 'Missing Index?'
+ ELSE
+ 'OK'
+ END,
+ pg_relation_size(relname::regclass) AS rel_size, seq_scan, idx_scan
+ FROM
+ pg_stat_all_tables
+ WHERE
+ schemaname = 'public'
+ AND pg_relation_size(relname::regclass) > 80000
+ ORDER BY
+ too_much_seq DESC;
+
+ relname | too_much_seq | case | rel_size | seq_scan | idx_scan
+----------------------+--------------+------+--------------+----------+-----------
+ file_rev_url | -1 | OK | 163323904 | 2 | 3
+ file_release | -3 | OK | 24461312 | 2 | 5
+ release_edit | -10265 | OK | 6080495616 | 2 | 10267
+ container_edit | -10265 | OK | 31170560 | 2 | 10267
+ work_edit | -10265 | OK | 6080364544 | 2 | 10267
+ file_edit | -10265 | OK | 49111040 | 2 | 10267
+ creator_edit | -10265 | OK | 330924032 | 2 | 10267
+ changelog | -11692 | OK | 106668032 | 2 | 11694
+ release_ref | -374197 | OK | 125437673472 | 3 | 374200
+ release_contrib | -374202 | OK | 16835354624 | 3 | 374205
+ release_rev_abstract | -374919 | OK | 162250752 | 3 | 374922
+ file_ident | -1047172 | OK | 41926656 | 2 | 1047174
+ container_rev | -1378943 | OK | 50356224 | 724612 | 2103555 <=
+ file_rev | -2407647 | OK | 68493312 | 4 | 2407651
+ abstracts | -2765610 | OK | 1450901504 | 1 | 2765611
+ creator_ident | -7127467 | OK | 242688000 | 2 | 7127469
+ creator_rev | -7943007 | OK | 353370112 | 839155 | 8782162 <=
+ container_ident | -57488245 | OK | 23060480 | 2 | 57488247
+ release_ident | -66085389 | OK | 4459159552 | 14 | 66085403
+ work_rev | -130613391 | OK | 2892775424 | 1 | 130613392
+ work_ident | -130633923 | OK | 4459192320 | 2 | 130633925
+ editgroup | -136775970 | OK | 120561664 | 1 | 136775971
+ release_rev | -718337215 | OK | 36850507776 | 8 | 718337223
+
+
+Slowest queries:
+
+ SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3
+
+ SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = "container_rev"."id") WHERE "container_rev"."issnl" = $1 AND "container_rev"."issnl" IS NOT NULL AND "container_ident"."is_live" = $2 AND "container_ident"."redirect_id" IS NULL LIMIT $3
+
+ SELECT $2 FROM ONLY "public"."release_rev" x WHERE "id" OPERATOR(pg_catalog.=) $1 FOR KEY SHARE OF x:...skipping...
+
+ SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3
+
+
+DEBUG: (while a file import is running)
+
+Creator lookup where row exists:
+
+ time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-1663
+ real 0m0.020s
+ user 0m0.004s
+ sys 0m0.004s
+
+ SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1;
+ => (1 row)
+ Time: 0.988 ms
+
+Creator lookup where row doesn't exist:
+
+ bnewbold@wbgrp-svc500$ time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X
+ {"message":"Not found: 0000-0002-8867-166X"}
+ real 0m1.282s
+ user 0m0.008s
+ sys 0m0.000s
+ Sep 10 21:50:49 wbgrp-svc500.us.archive.org fatcat-api[25327]: Sep 10 21:50:49.231 INFO GET http://localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X 404 Not Found (1282 ms)
+
+ SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1;
+ => (0 rows)
+ Time: 0.810 ms
+
+-------
+
+ SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1;
+
+ SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1;
+ SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1;
+
+from logs:
+
+ execute __diesel_stmt_5: SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3
+
+NOTE: have been doing *full* postgres logs this whole time! probably a ton of disk churn. :(
+
+
+ exact logs:
+ SET TIME ZONE 'UTC'
+ SET CLIENT_ENCODING TO 'UTF8'
+ SELECT 1
+ SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3
+ parameters: $1 = '0000-0002-8867-166X', $2 = 't', $3 = '1'
+
+ SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT '1';
+
+CHANGES:
+- restart fatcat-api
+- restart postgresql
+- select pg_stat_reset();
+
+seems like the fetch involves a index scans on creator_rev and creator_ident, *and* a scan on creator_rev.
+
+
+ table_name | table_size | indexes_size | total_size
+--------------------------------------------------------------+------------+--------------+------------
+ "public"."release_ref" | 117 GB | 34 GB | 151 GB
+ "public"."release_rev" | 34 GB | 9202 MB | 43 GB
+ "public"."release_contrib" | 16 GB | 17 GB | 33 GB
+ "public"."work_edit" | 5800 MB | 4033 MB | 9833 MB
+ "public"."release_edit" | 5800 MB | 4032 MB | 9833 MB
+ "public"."work_ident" | 4254 MB | 5102 MB | 9356 MB
+ "public"."release_ident" | 4254 MB | 5095 MB | 9349 MB
+ "public"."work_rev" | 2759 MB | 2545 MB | 5304 MB
+ "public"."abstracts" | 1417 MB | 114 MB | 1530 MB
+ "public"."creator_rev" | 337 MB | 277 MB | 614 MB
+ "public"."creator_edit" | 316 MB | 221 MB | 536 MB
+ "public"."creator_ident" | 232 MB | 279 MB | 511 MB
+ "public"."file_rev_url" | 260 MB | 101 MB | 361 MB
+ "public"."release_rev_abstract" | 155 MB | 200 MB | 355 MB
+ "public"."file_rev" | 109 MB | 124 MB | 233 MB
+ "public"."changelog" | 102 MB | 106 MB | 208 MB
+ "public"."editgroup" | 116 MB | 69 MB | 184 MB
+ "public"."file_ident" | 66 MB | 70 MB | 136 MB
+ "public"."file_edit" | 78 MB | 53 MB | 131 MB
+ "public"."file_release" | 38 MB | 64 MB | 103 MB
+ "public"."container_rev" | 48 MB | 26 MB | 74 MB
+ "public"."container_edit" | 30 MB | 20 MB | 50 MB
+ "public"."container_ident" | 22 MB | 26 MB | 48 MB
+
+CHANGES:
+- remove "IS NOT NULL" from creator_rev, that seemed to be a significant speedup for the "row not found" case.
+