diff options
Diffstat (limited to 'notes/import_timing_20180908.txt')
-rw-r--r-- | notes/import_timing_20180908.txt | 249 |
1 files changed, 249 insertions, 0 deletions
diff --git a/notes/import_timing_20180908.txt b/notes/import_timing_20180908.txt new file mode 100644 index 00000000..3091e4fa --- /dev/null +++ b/notes/import_timing_20180908.txt @@ -0,0 +1,249 @@ + +Changes since last time: +- auto-accept flag (potentially no need to UPDATE/vacuum database) +- large CRUD code refactor (but not complete removal of api_server CRUD yet) +- removed "single query insert per entity", but added row-based inserts + +## Laptop Rough Timing + +HTTP-VERB branch 7354899493f6448bed5698ad6ade1dbebcf39379: +=> note: in this branch, importer is sitll creating editgroups in separate request + + time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv + real 0m28.779s + user 0m5.716s + sys 0m0.208s + + cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt + + 89.83user 4.58system 1:46.67elapsed 88%CPU (0avgtext+0avgdata 386184maxresident)k + 78632inputs+385408outputs (1major+579693minor)pagefaults 0swaps + # 1:46.67 elapsed + + # running again + time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv + real 0m29.714s + user 0m6.048s + sys 0m0.212s + + cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt + 89.07user 4.52system 1:48.11elapsed 86%CPU (0avgtext+0avgdata 385940maxresident)k + 0inputs+377456outputs (0major+583532minor)pagefaults 0swaps + # 1:48.11 elapsed + +MASTER branch 0053d133f8ff96aa4dedc1ff7e2754812ddfc79a + + time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv + real 1m8.061s + user 0m7.384s + sys 0m0.280s + + cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt + 96.68user 5.25system 3:23.05elapsed 50%CPU (0avgtext+0avgdata 385516maxresident)k + 0inputs+389168outputs (0major+586810minor)pagefaults 0swaps + # 3:23.05 elapsed + +AUTO-ACCEPT branch 8cccbcdef11e7ddc761ec494cb894a8d49a0d510 + + time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv + real 1m7.757s + user 0m6.240s + sys 0m0.228s + + cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt + 91.73user 4.65system 3:26.61elapsed 46%CPU (0avgtext+0avgdata 385512maxresident)k + 0inputs+384976outputs (0major+580544minor)pagefaults 0swaps + +So, annecdotally, seems like autoaccept didn't do much here (not +vacuum-limited?), but the row inserts were more than a factor of 2x performance +improvement. Great! Could try experimenting with even larger batch sizes... + + +## Production Import + +http-verb branch 7354899493f6448bed5698ad6ade1dbebcf39379 + + time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv + + real 0m30.845s + user 0m8.884s + sys 0m0.312s + + + time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - + # TPS: 1181 + real 6m54.019s => down from 22m! + user 25m33.980s + sys 1m30.480s + + xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt + # fatcatd at 200% CPU (2 full cores); many PIDs/workers, but only one so busy (must be diesel/db driver?) + # parallel at ~80% + # postgres pretty busy; looks like doing PARSE on ever request? some idle in transaction + # postgres still does a vacuum to analyze; good! + # ~600-1000 TPS near start (large variance) + # left to run overnight... + # slowed down to ~50-80 TPS about 10 hours later + # lots of IOWAIT + # only at 40309165 rows in the morning; this could take a long time + + # PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgres - Ref.: 2s + # Size: 153.71G - 1.90M/s | TPS: 77 + # Mem.: 68.70% - 22.60G/49.14G | IO Max: 20448/s + # Swap: 3.80% - 1.88G/50.00G | Read : 10.17M/s - 2603/s + # Load: 11.71 11.80 12.01 | Write: 4.00M/s - 1024/s + + 92530.17user 2891.76system 35:45:15elapsed 74%CPU (0avgtext+0avgdata 463520maxresident)k + 1093736inputs+302588448outputs (52568major+36405225minor)pagefaults 0swaps + # 35:45:15 elapsed + + time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite + +## Perf Tweaks + + + + SELECT + relname, + seq_scan - idx_scan AS too_much_seq, + CASE + WHEN + seq_scan - coalesce(idx_scan, 0) > 0 + THEN + 'Missing Index?' + ELSE + 'OK' + END, + pg_relation_size(relname::regclass) AS rel_size, seq_scan, idx_scan + FROM + pg_stat_all_tables + WHERE + schemaname = 'public' + AND pg_relation_size(relname::regclass) > 80000 + ORDER BY + too_much_seq DESC; + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan +----------------------+--------------+------+--------------+----------+----------- + file_rev_url | -1 | OK | 163323904 | 2 | 3 + file_release | -3 | OK | 24461312 | 2 | 5 + release_edit | -10265 | OK | 6080495616 | 2 | 10267 + container_edit | -10265 | OK | 31170560 | 2 | 10267 + work_edit | -10265 | OK | 6080364544 | 2 | 10267 + file_edit | -10265 | OK | 49111040 | 2 | 10267 + creator_edit | -10265 | OK | 330924032 | 2 | 10267 + changelog | -11692 | OK | 106668032 | 2 | 11694 + release_ref | -374197 | OK | 125437673472 | 3 | 374200 + release_contrib | -374202 | OK | 16835354624 | 3 | 374205 + release_rev_abstract | -374919 | OK | 162250752 | 3 | 374922 + file_ident | -1047172 | OK | 41926656 | 2 | 1047174 + container_rev | -1378943 | OK | 50356224 | 724612 | 2103555 <= + file_rev | -2407647 | OK | 68493312 | 4 | 2407651 + abstracts | -2765610 | OK | 1450901504 | 1 | 2765611 + creator_ident | -7127467 | OK | 242688000 | 2 | 7127469 + creator_rev | -7943007 | OK | 353370112 | 839155 | 8782162 <= + container_ident | -57488245 | OK | 23060480 | 2 | 57488247 + release_ident | -66085389 | OK | 4459159552 | 14 | 66085403 + work_rev | -130613391 | OK | 2892775424 | 1 | 130613392 + work_ident | -130633923 | OK | 4459192320 | 2 | 130633925 + editgroup | -136775970 | OK | 120561664 | 1 | 136775971 + release_rev | -718337215 | OK | 36850507776 | 8 | 718337223 + + +Slowest queries: + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + + SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = "container_rev"."id") WHERE "container_rev"."issnl" = $1 AND "container_rev"."issnl" IS NOT NULL AND "container_ident"."is_live" = $2 AND "container_ident"."redirect_id" IS NULL LIMIT $3 + + SELECT $2 FROM ONLY "public"."release_rev" x WHERE "id" OPERATOR(pg_catalog.=) $1 FOR KEY SHARE OF x:...skipping... + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + + +DEBUG: (while a file import is running) + +Creator lookup where row exists: + + time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-1663 + real 0m0.020s + user 0m0.004s + sys 0m0.004s + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + => (1 row) + Time: 0.988 ms + +Creator lookup where row doesn't exist: + + bnewbold@wbgrp-svc500$ time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X + {"message":"Not found: 0000-0002-8867-166X"} + real 0m1.282s + user 0m0.008s + sys 0m0.000s + Sep 10 21:50:49 wbgrp-svc500.us.archive.org fatcat-api[25327]: Sep 10 21:50:49.231 INFO GET http://localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X 404 Not Found (1282 ms) + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + => (0 rows) + Time: 0.810 ms + +------- + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + +from logs: + + execute __diesel_stmt_5: SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + +NOTE: have been doing *full* postgres logs this whole time! probably a ton of disk churn. :( + + + exact logs: + SET TIME ZONE 'UTC' + SET CLIENT_ENCODING TO 'UTF8' + SELECT 1 + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + parameters: $1 = '0000-0002-8867-166X', $2 = 't', $3 = '1' + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT '1'; + +CHANGES: +- restart fatcat-api +- restart postgresql +- select pg_stat_reset(); + +seems like the fetch involves a index scans on creator_rev and creator_ident, *and* a scan on creator_rev. + + + table_name | table_size | indexes_size | total_size +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref" | 117 GB | 34 GB | 151 GB + "public"."release_rev" | 34 GB | 9202 MB | 43 GB + "public"."release_contrib" | 16 GB | 17 GB | 33 GB + "public"."work_edit" | 5800 MB | 4033 MB | 9833 MB + "public"."release_edit" | 5800 MB | 4032 MB | 9833 MB + "public"."work_ident" | 4254 MB | 5102 MB | 9356 MB + "public"."release_ident" | 4254 MB | 5095 MB | 9349 MB + "public"."work_rev" | 2759 MB | 2545 MB | 5304 MB + "public"."abstracts" | 1417 MB | 114 MB | 1530 MB + "public"."creator_rev" | 337 MB | 277 MB | 614 MB + "public"."creator_edit" | 316 MB | 221 MB | 536 MB + "public"."creator_ident" | 232 MB | 279 MB | 511 MB + "public"."file_rev_url" | 260 MB | 101 MB | 361 MB + "public"."release_rev_abstract" | 155 MB | 200 MB | 355 MB + "public"."file_rev" | 109 MB | 124 MB | 233 MB + "public"."changelog" | 102 MB | 106 MB | 208 MB + "public"."editgroup" | 116 MB | 69 MB | 184 MB + "public"."file_ident" | 66 MB | 70 MB | 136 MB + "public"."file_edit" | 78 MB | 53 MB | 131 MB + "public"."file_release" | 38 MB | 64 MB | 103 MB + "public"."container_rev" | 48 MB | 26 MB | 74 MB + "public"."container_edit" | 30 MB | 20 MB | 50 MB + "public"."container_ident" | 22 MB | 26 MB | 48 MB + +CHANGES: +- remove "IS NOT NULL" from creator_rev, that seemed to be a significant speedup for the "row not found" case. + |