aboutsummaryrefslogtreecommitdiffstats
path: root/notes/import_timing_20180908.txt
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2018-09-25 15:44:39 -0700
committerBryan Newbold <bnewbold@robocracy.org>2018-09-25 15:44:39 -0700
commit186c03c2b9e1b08e4298383fc6593d1b2c3a5dd8 (patch)
treeb116678710d9a69df023f7dc90525d7563506891 /notes/import_timing_20180908.txt
parentc1391060cc4575365f81fc674e35d8c182ccde83 (diff)
downloadfatcat-186c03c2b9e1b08e4298383fc6593d1b2c3a5dd8.tar.gz
fatcat-186c03c2b9e1b08e4298383fc6593d1b2c3a5dd8.zip
start organizing notes into subdirectories
Diffstat (limited to 'notes/import_timing_20180908.txt')
-rw-r--r--notes/import_timing_20180908.txt249
1 files changed, 0 insertions, 249 deletions
diff --git a/notes/import_timing_20180908.txt b/notes/import_timing_20180908.txt
deleted file mode 100644
index 3091e4fa..00000000
--- a/notes/import_timing_20180908.txt
+++ /dev/null
@@ -1,249 +0,0 @@
-
-Changes since last time:
-- auto-accept flag (potentially no need to UPDATE/vacuum database)
-- large CRUD code refactor (but not complete removal of api_server CRUD yet)
-- removed "single query insert per entity", but added row-based inserts
-
-## Laptop Rough Timing
-
-HTTP-VERB branch 7354899493f6448bed5698ad6ade1dbebcf39379:
-=> note: in this branch, importer is sitll creating editgroups in separate request
-
- time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv
- real 0m28.779s
- user 0m5.716s
- sys 0m0.208s
-
- cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt
-
- 89.83user 4.58system 1:46.67elapsed 88%CPU (0avgtext+0avgdata 386184maxresident)k
- 78632inputs+385408outputs (1major+579693minor)pagefaults 0swaps
- # 1:46.67 elapsed
-
- # running again
- time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv
- real 0m29.714s
- user 0m6.048s
- sys 0m0.212s
-
- cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt
- 89.07user 4.52system 1:48.11elapsed 86%CPU (0avgtext+0avgdata 385940maxresident)k
- 0inputs+377456outputs (0major+583532minor)pagefaults 0swaps
- # 1:48.11 elapsed
-
-MASTER branch 0053d133f8ff96aa4dedc1ff7e2754812ddfc79a
-
- time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv
- real 1m8.061s
- user 0m7.384s
- sys 0m0.280s
-
- cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt
- 96.68user 5.25system 3:23.05elapsed 50%CPU (0avgtext+0avgdata 385516maxresident)k
- 0inputs+389168outputs (0major+586810minor)pagefaults 0swaps
- # 3:23.05 elapsed
-
-AUTO-ACCEPT branch 8cccbcdef11e7ddc761ec494cb894a8d49a0d510
-
- time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv
- real 1m7.757s
- user 0m6.240s
- sys 0m0.228s
-
- cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt
- 91.73user 4.65system 3:26.61elapsed 46%CPU (0avgtext+0avgdata 385512maxresident)k
- 0inputs+384976outputs (0major+580544minor)pagefaults 0swaps
-
-So, annecdotally, seems like autoaccept didn't do much here (not
-vacuum-limited?), but the row inserts were more than a factor of 2x performance
-improvement. Great! Could try experimenting with even larger batch sizes...
-
-
-## Production Import
-
-http-verb branch 7354899493f6448bed5698ad6ade1dbebcf39379
-
- time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv
-
- real 0m30.845s
- user 0m8.884s
- sys 0m0.312s
-
-
- time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid -
- # TPS: 1181
- real 6m54.019s => down from 22m!
- user 25m33.980s
- sys 1m30.480s
-
- xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt
- # fatcatd at 200% CPU (2 full cores); many PIDs/workers, but only one so busy (must be diesel/db driver?)
- # parallel at ~80%
- # postgres pretty busy; looks like doing PARSE on ever request? some idle in transaction
- # postgres still does a vacuum to analyze; good!
- # ~600-1000 TPS near start (large variance)
- # left to run overnight...
- # slowed down to ~50-80 TPS about 10 hours later
- # lots of IOWAIT
- # only at 40309165 rows in the morning; this could take a long time
-
- # PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgres - Ref.: 2s
- # Size: 153.71G - 1.90M/s | TPS: 77
- # Mem.: 68.70% - 22.60G/49.14G | IO Max: 20448/s
- # Swap: 3.80% - 1.88G/50.00G | Read : 10.17M/s - 2603/s
- # Load: 11.71 11.80 12.01 | Write: 4.00M/s - 1024/s
-
- 92530.17user 2891.76system 35:45:15elapsed 74%CPU (0avgtext+0avgdata 463520maxresident)k
- 1093736inputs+302588448outputs (52568major+36405225minor)pagefaults 0swaps
- # 35:45:15 elapsed
-
- time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite
-
-## Perf Tweaks
-
-
-
- SELECT
- relname,
- seq_scan - idx_scan AS too_much_seq,
- CASE
- WHEN
- seq_scan - coalesce(idx_scan, 0) > 0
- THEN
- 'Missing Index?'
- ELSE
- 'OK'
- END,
- pg_relation_size(relname::regclass) AS rel_size, seq_scan, idx_scan
- FROM
- pg_stat_all_tables
- WHERE
- schemaname = 'public'
- AND pg_relation_size(relname::regclass) > 80000
- ORDER BY
- too_much_seq DESC;
-
- relname | too_much_seq | case | rel_size | seq_scan | idx_scan
-----------------------+--------------+------+--------------+----------+-----------
- file_rev_url | -1 | OK | 163323904 | 2 | 3
- file_release | -3 | OK | 24461312 | 2 | 5
- release_edit | -10265 | OK | 6080495616 | 2 | 10267
- container_edit | -10265 | OK | 31170560 | 2 | 10267
- work_edit | -10265 | OK | 6080364544 | 2 | 10267
- file_edit | -10265 | OK | 49111040 | 2 | 10267
- creator_edit | -10265 | OK | 330924032 | 2 | 10267
- changelog | -11692 | OK | 106668032 | 2 | 11694
- release_ref | -374197 | OK | 125437673472 | 3 | 374200
- release_contrib | -374202 | OK | 16835354624 | 3 | 374205
- release_rev_abstract | -374919 | OK | 162250752 | 3 | 374922
- file_ident | -1047172 | OK | 41926656 | 2 | 1047174
- container_rev | -1378943 | OK | 50356224 | 724612 | 2103555 <=
- file_rev | -2407647 | OK | 68493312 | 4 | 2407651
- abstracts | -2765610 | OK | 1450901504 | 1 | 2765611
- creator_ident | -7127467 | OK | 242688000 | 2 | 7127469
- creator_rev | -7943007 | OK | 353370112 | 839155 | 8782162 <=
- container_ident | -57488245 | OK | 23060480 | 2 | 57488247
- release_ident | -66085389 | OK | 4459159552 | 14 | 66085403
- work_rev | -130613391 | OK | 2892775424 | 1 | 130613392
- work_ident | -130633923 | OK | 4459192320 | 2 | 130633925
- editgroup | -136775970 | OK | 120561664 | 1 | 136775971
- release_rev | -718337215 | OK | 36850507776 | 8 | 718337223
-
-
-Slowest queries:
-
- SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3
-
- SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = "container_rev"."id") WHERE "container_rev"."issnl" = $1 AND "container_rev"."issnl" IS NOT NULL AND "container_ident"."is_live" = $2 AND "container_ident"."redirect_id" IS NULL LIMIT $3
-
- SELECT $2 FROM ONLY "public"."release_rev" x WHERE "id" OPERATOR(pg_catalog.=) $1 FOR KEY SHARE OF x:...skipping...
-
- SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3
-
-
-DEBUG: (while a file import is running)
-
-Creator lookup where row exists:
-
- time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-1663
- real 0m0.020s
- user 0m0.004s
- sys 0m0.004s
-
- SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1;
- => (1 row)
- Time: 0.988 ms
-
-Creator lookup where row doesn't exist:
-
- bnewbold@wbgrp-svc500$ time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X
- {"message":"Not found: 0000-0002-8867-166X"}
- real 0m1.282s
- user 0m0.008s
- sys 0m0.000s
- Sep 10 21:50:49 wbgrp-svc500.us.archive.org fatcat-api[25327]: Sep 10 21:50:49.231 INFO GET http://localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X 404 Not Found (1282 ms)
-
- SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1;
- => (0 rows)
- Time: 0.810 ms
-
--------
-
- SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1;
-
- SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1;
- SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1;
-
-from logs:
-
- execute __diesel_stmt_5: SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3
-
-NOTE: have been doing *full* postgres logs this whole time! probably a ton of disk churn. :(
-
-
- exact logs:
- SET TIME ZONE 'UTC'
- SET CLIENT_ENCODING TO 'UTF8'
- SELECT 1
- SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3
- parameters: $1 = '0000-0002-8867-166X', $2 = 't', $3 = '1'
-
- SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT '1';
-
-CHANGES:
-- restart fatcat-api
-- restart postgresql
-- select pg_stat_reset();
-
-seems like the fetch involves a index scans on creator_rev and creator_ident, *and* a scan on creator_rev.
-
-
- table_name | table_size | indexes_size | total_size
---------------------------------------------------------------+------------+--------------+------------
- "public"."release_ref" | 117 GB | 34 GB | 151 GB
- "public"."release_rev" | 34 GB | 9202 MB | 43 GB
- "public"."release_contrib" | 16 GB | 17 GB | 33 GB
- "public"."work_edit" | 5800 MB | 4033 MB | 9833 MB
- "public"."release_edit" | 5800 MB | 4032 MB | 9833 MB
- "public"."work_ident" | 4254 MB | 5102 MB | 9356 MB
- "public"."release_ident" | 4254 MB | 5095 MB | 9349 MB
- "public"."work_rev" | 2759 MB | 2545 MB | 5304 MB
- "public"."abstracts" | 1417 MB | 114 MB | 1530 MB
- "public"."creator_rev" | 337 MB | 277 MB | 614 MB
- "public"."creator_edit" | 316 MB | 221 MB | 536 MB
- "public"."creator_ident" | 232 MB | 279 MB | 511 MB
- "public"."file_rev_url" | 260 MB | 101 MB | 361 MB
- "public"."release_rev_abstract" | 155 MB | 200 MB | 355 MB
- "public"."file_rev" | 109 MB | 124 MB | 233 MB
- "public"."changelog" | 102 MB | 106 MB | 208 MB
- "public"."editgroup" | 116 MB | 69 MB | 184 MB
- "public"."file_ident" | 66 MB | 70 MB | 136 MB
- "public"."file_edit" | 78 MB | 53 MB | 131 MB
- "public"."file_release" | 38 MB | 64 MB | 103 MB
- "public"."container_rev" | 48 MB | 26 MB | 74 MB
- "public"."container_edit" | 30 MB | 20 MB | 50 MB
- "public"."container_ident" | 22 MB | 26 MB | 48 MB
-
-CHANGES:
-- remove "IS NOT NULL" from creator_rev, that seemed to be a significant speedup for the "row not found" case.
-