Diffstat (limited to 'notes/bootstrap')
 -rw-r--r--  notes/bootstrap/import_timing_20180815.txt | 292
 -rw-r--r--  notes/bootstrap/import_timing_20180908.txt | 249
 -rw-r--r--  notes/bootstrap/import_timing_20180910.txt | 319
 -rw-r--r--  notes/bootstrap/import_timing_20180920.txt |  96
 -rw-r--r--  notes/bootstrap/import_timing_20180923.txt |  91
 -rw-r--r--  notes/bootstrap/initial_sources.txt        |  26
 -rw-r--r--  notes/bootstrap/old_imports.txt            |  20
 7 files changed, 1093 insertions, 0 deletions
| diff --git a/notes/bootstrap/import_timing_20180815.txt b/notes/bootstrap/import_timing_20180815.txt new file mode 100644 index 00000000..1206cc41 --- /dev/null +++ b/notes/bootstrap/import_timing_20180815.txt @@ -0,0 +1,292 @@ + +Schema changes since previous imports: +- more fields (identifiers+indexes) +- timestamps +- UUIDs more places +- fixed some crossref import bugs? +- abstracts +- file_urls as table (not single value) +- timestamps +- TEXT -> CHAR in a few places +- removed many work fields + +## Containers + +(python)webcrawl@wbgrp-svc500:/srv/fatcat/src/python$ time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv + +real    1m25.292s +user    0m12.640s +sys     0m0.412s + +## Creators + +time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - + +(times very invalid due to hangs; got 3537837 creators, which is most of the way, so *shrug*) +real    22m2.465s +user    26m41.924s +sys     1m33.844s + +## Releases + +xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt + +    128516.30 user +    3905.14 system +    44:17:05 elapsed +    83% CPU + +Almost 44 hours... I think I remember more like 36 hours last time? Things +slowed down a lot towards the end, many more ORCID cross-references? + +looking in htop, postgres seems to be primary bottleneck. At something like 12 +hours in, had 44 million release_ident rows, which is 1000/second. + +Note: seems like the more frequently `count(*)` is run, the more performant. +Because in-memory? + +    2018-08-16 16:54:16.977 UTC [17996] postgres@fatcat_prod LOG:  duration: 42949.549 ms  statement: select count(id) from release_ident; + +    fatcat_prod=# select count(*) from release_ident; +    count    +    ---------- +    44185608 +    (1 row) + +    Time: 2753.916 ms (00:02.754) +    fatcat_prod=# select count(*) from release_ident; +    count    +    ---------- +    44187937 +    (1 row) + +    Time: 2711.670 ms (00:02.712) + +As expected, autovacuum very busy. Only ~150 TPS; but that includes batch +writes? 75061172 rows. + +## Files + +    time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite + +    Done! Inserted 6607075 + +    real    2152m28.822s => 36 hours (!) +    user    401m46.464s +    sys     21m45.724s + + +Going pretty slow, < 100 transactions/sec. Lots of SELECTs, which seem slow, on the abstract table? + +    SELECT "release_rev_abstract"."id", "release_rev_abstract"."release_rev", "release_rev_abstract"."abstract_sha1", "release_rev_abstract"."mimetype", "release_rev_abstract"."lang", "abstracts"."sha1", "abstracts"."content" FROM ("release_rev_abstract" INNER JOIN "abstracts" ON "release_rev_abstract"."abstract_sha1" = "abstracts"."sha1") WHERE "release_rev_abstract"."release_rev" = 'ffffffc0-4dd2-47ce-a51d-44051f3699ce'; + +Created index: +     +    CREATE INDEX release_rev_abstract_rev_idx ON release_rev_abstract(release_rev); + +... and things sped way up. Re-ran some crossref imports to EXPLAIN and didn't +see non-indexed queries. Maybe an ANALYZE does need to happen? + +This being single-threaded is going to be a problem in the future. ~50 million +files would be ~2 weeks. + +## Post-Import Status + +    Size:  358.89G (postgres self-reported) +    Mem.:   57.10% -    16.85G/49.14G + +Was 184G last time in late June; doubled in size (!). 
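(Aside, not from the original run: the "self-reported" size above is presumably just something like the query below.)

    -- hypothetical; asking postgres for its own database size
    SELECT pg_size_pretty(pg_database_size('fatcat_prod'));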
+ +    bnewbold@wbgrp-svc500$ df -h / +    Filesystem      Size  Used Avail Use% Mounted on +    /dev/vda1       858G  529G  286G  65% / + +    bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/ /srv/datasets/ /srv/elastic-blah/ +    361G    /var/lib/postgresql/ +    83G     /srv/datasets/ +    77G     /srv/elastic-blah/ + +    fatcat_prod=# select count(*) from changelog; => 2,085,067 + +    SELECT +        table_name, +        pg_size_pretty(table_size) AS table_size, +        pg_size_pretty(indexes_size) AS indexes_size, +        pg_size_pretty(total_size) AS total_size +    FROM ( +        SELECT +            table_name, +            pg_table_size(table_name) AS table_size, +            pg_indexes_size(table_name) AS indexes_size, +            pg_total_relation_size(table_name) AS total_size +        FROM ( +            SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name +            FROM information_schema.tables +        ) AS all_tables +        ORDER BY total_size DESC +    ) AS pretty_sizes; + +                          table_name                          | table_size | indexes_size | total_size  +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref"                                       | 159 GB     | 47 GB        | 206 GB + "public"."release_rev"                                       | 40 GB      | 10 GB        | 51 GB + "public"."release_contrib"                                   | 19 GB      | 20 GB        | 39 GB + "public"."release_ident"                                     | 5797 MB    | 6597 MB      | 12 GB + "public"."work_ident"                                        | 5787 MB    | 6394 MB      | 12 GB + "public"."release_edit"                                      | 6674 MB    | 4646 MB      | 11 GB + "public"."work_edit"                                         | 6674 MB    | 4646 MB      | 11 GB + "public"."work_rev"                                          | 3175 MB    | 2939 MB      | 6114 MB + "public"."file_rev_url"                                      | 1995 MB    | 275 MB       | 2270 MB + "public"."abstracts"                                         | 1665 MB    | 135 MB       | 1800 MB + "public"."file_rev"                                          | 829 MB     | 954 MB       | 1783 MB + "public"."file_ident"                                        | 498 MB     | 532 MB       | 1030 MB + "public"."file_release"                                      | 369 MB     | 642 MB       | 1011 MB + "public"."file_edit"                                         | 591 MB     | 410 MB       | 1002 MB + "public"."creator_rev"                                       | 337 MB     | 318 MB       | 655 MB + "public"."creator_ident"                                     | 280 MB     | 297 MB       | 577 MB + "public"."creator_edit"                                      | 316 MB     | 220 MB       | 536 MB + "public"."release_rev_abstract"                              | 183 MB     | 84 MB        | 267 MB + "public"."changelog"                                         | 123 MB     | 125 MB       | 249 MB + "public"."editgroup"                                         | 139 MB     | 81 MB        | 220 MB + "public"."container_rev"                                     | 19 MB      | 6912 kB      | 26 MB + "public"."container_ident"                                   | 6896 kB    | 7016 kB      | 14 MB + "public"."container_edit"                                    | 8056 kB    | 5240 kB      | 13 MB + 
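Aside: a cheaper way to spot-check row counts mid-import than the multi-second `count(*)` calls above would be the planner's own estimates; a sketch, not something run at the time:

    -- approximate row counts from planner statistics; only as fresh as the last
    -- ANALYZE/autovacuum, so expect them to lag during a bulk import
    SELECT relname, reltuples::bigint AS approx_rows
    FROM pg_class
    WHERE relkind = 'r'
      AND relname IN ('release_ident', 'release_rev', 'release_ref')
    ORDER BY approx_rows DESC;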
+In context, the full uncompressed crossref 2018-01-21 dump is about 285 GB. + +For many of these indexes, and the _ident tables, switching from UUID to +BIGSERIAL would half the size. + +## Exports + +    time ./fatcat_export.py changelog - | pv -l | wc + +    As of: + +    159k 1:17:35 [34.3 /s] +    159,740 lines +    2,427,277,881 chars (bytes; 2.4GB) + +    real    77m35.183s +    user    15m36.208s +    sys     0m31.484s + +Running at about 100/sec; estimate 6 hours for completion. Could shard using +start/end flags, but am not here. + +Running `quick_dump.sql` (identifier tables, in a transaction): + +    251M Aug 19 23:08 fatcat_ident_creators.tsv +    5.9M Aug 19 23:08 fatcat_ident_containers.tsv +    467M Aug 19 23:08 fatcat_ident_files.tsv +    5.2G Aug 19 23:10 fatcat_ident_releases.tsv +    5.2G Aug 19 23:11 fatcat_ident_works.tsv +     12K Aug 19 23:11 . +    1.8G Aug 19 23:12 fatcat_abstracts.json + +Work and Release tables in under 2 minutes each; say 5 minutes total. + +    time ./fatcat_export.py releases /tmp/fatcat_ident_releases.tsv - | pv -l | wc + +    172k 1:07:08 [42.7 /s] +    172181 lines +    1,118,166,293 chars (bytes; 1.1 GB) + +    real    67m8.340s +    user    10m21.988s +    sys     0m34.612s + +Running at only 10/sec or so, this would take forever even if sharded. :( + +Both exports/dumps are running in parallel. "Expand" queries might help with speed? + +## Postgres Analysis + +SELECT * +FROM +  pg_stat_statements +ORDER BY +  total_time DESC LIMIT 5; + +Summary: + +    SELECT "creator_ident" by ORCID +        1,295,864 calls +        930,305,208 total time +        717.9 mean time     <= this should be less than a ms! + +    INSERT INTO release_rev +        75144055 calls +        111470961 total time +        1.483 mean time + +    INSERT INTO work_rev +        75,144,055 calls +        82693994 total time +        1.1 mean time + +    INSERT INTO release_contrib (creator_ident_id = DEFAULT) RETURNING * +        26,008,280 calls    <= why so few? different query depending on number +                               of rows inserted +        18955782 total time +        0.728 mean time + +    SELECT container_ident +        78,4143 calls +        17683156 total time +        22.55 mean time     <= why so slow? + +    INSERT INTO release_contrib  +        15,072,820 calls + +    INSERT INTO "release_contrib + + +       relname        | too_much_seq |      case      |   rel_size   | seq_scan | idx_scan   +----------------------+--------------+----------------+--------------+----------+----------- + file_rev_url         |         2391 | Missing Index? 
|   2091147264 |     2391 |         0 + file_release         |       -30670 | OK             |    386899968 |        2 |     30672 + container_rev        |      -979948 | OK             |     20242432 |   784146 |   1764094 + file_edit            |     -2206807 | OK             |    619896832 |        6 |   2206813 + creator_edit         |     -2206810 | OK             |    331079680 |       11 |   2206821 + work_edit            |     -2206811 | OK             |   6996566016 |       14 |   2206825 + release_edit         |     -2206811 | OK             |   6996582400 |       14 |   2206825 + container_edit       |     -2206816 | OK             |      8216576 |        5 |   2206821 + changelog            |     -2209659 | OK             |    129286144 |       10 |   2209669 + abstracts            |     -3486466 | OK             |   1706237952 |        8 |   3486474 + release_rev_abstract |     -4975493 | OK             |    191602688 |    42919 |   5018412 + release_ref          |     -5032717 | OK             | 170494861312 |        3 |   5032720 + release_contrib      |     -5032744 | OK             |  20370251776 |        3 |   5032747 + creator_rev          |     -8400410 | OK             |    353583104 |  1296507 |   9696917 + file_ident           |    -13483224 | OK             |    522190848 |        7 |  13483231 + creator_ident        |    -16686744 | OK             |    293625856 |        3 |  16686747 + file_rev             |    -32405557 | OK             |    868515840 |        4 |  32405561 + container_ident      |    -69162337 | OK             |      7028736 |        3 |  69162340 + work_rev             |   -150288161 | OK             |   3328589824 |        1 | 150288162 + editgroup            |   -162783807 | OK             |    146112512 |        9 | 162783816 + release_ident        |   -165676917 | OK             |   6076841984 |       52 | 165676969 + work_ident           |   -229439828 | OK             |   6066814976 |        3 | 229439831 + release_rev          |   -930140217 | OK             |  43360542720 |        9 | 930140226 + +TODO changes: +- don't return all as often; in particular, inserting release_contrib, release_ref +x missing an index somewhere on file_rev_url, release_rev_abstract +x why so many seq_scan on container_rev, creator_rev +    => running/EXPLAIN same query on psql hits index, not seq_scan +    => seemed to be an issue with VALUE params getting sent separately; query +    planner only looked at query and wasn't using index on ORCID/ISSN-L because +    it didn't know those values were not-NULL? +    => adding NOT NULL to query seems to have sped up case of there being a +    "hit", but no hit still slow. 
might need to change indices or something for +    the (perhaps common in future) case of DOI lookups with invalid DOIs (eg, +    CORE import) + +random DEBUG queries: + +    EXPLAIN ANALYSE SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = true AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + +    EXPLAIN VERBOSE SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_ident"."is_live" = true AND "creator_ident"."redirect_id" IS NULL VALUES ('0000-0002-8867-1669') LIMIT 1; + +    EXPLAIN SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = "container_rev"."id") WHERE "container_rev"."issnl" = '0001-0782' AND "container_ident"."is_live" = true AND "container_ident"."redirect_id" IS NULL LIMIT 1; + +    SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; diff --git a/notes/bootstrap/import_timing_20180908.txt b/notes/bootstrap/import_timing_20180908.txt new file mode 100644 index 00000000..3091e4fa --- /dev/null +++ b/notes/bootstrap/import_timing_20180908.txt @@ -0,0 +1,249 @@ + +Changes since last time: +- auto-accept flag (potentially no need to UPDATE/vacuum database) +- large CRUD code refactor (but not complete removal of api_server CRUD yet) +- removed "single query insert per entity", but added row-based inserts + +## Laptop Rough Timing + +HTTP-VERB branch 7354899493f6448bed5698ad6ade1dbebcf39379: +=> note: in this branch, importer is sitll creating editgroups in separate request + +    time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv +    real    0m28.779s +    user    0m5.716s +    sys     0m0.208s + +    cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt + +    89.83user 4.58system 1:46.67elapsed 88%CPU (0avgtext+0avgdata 386184maxresident)k +    
78632inputs+385408outputs (1major+579693minor)pagefaults 0swaps +    # 1:46.67 elapsed + +    # running again +    time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv +    real    0m29.714s +    user    0m6.048s +    sys     0m0.212s + +    cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt +    89.07user 4.52system 1:48.11elapsed 86%CPU (0avgtext+0avgdata 385940maxresident)k +    0inputs+377456outputs (0major+583532minor)pagefaults 0swaps +    # 1:48.11 elapsed + +MASTER branch 0053d133f8ff96aa4dedc1ff7e2754812ddfc79a + +    time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv +    real    1m8.061s +    user    0m7.384s +    sys     0m0.280s + +    cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt +    96.68user 5.25system 3:23.05elapsed 50%CPU (0avgtext+0avgdata 385516maxresident)k +    0inputs+389168outputs (0major+586810minor)pagefaults 0swaps +    # 3:23.05 elapsed + +AUTO-ACCEPT branch 8cccbcdef11e7ddc761ec494cb894a8d49a0d510 + +    time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv +    real    1m7.757s +    user    0m6.240s +    sys     0m0.228s + +    cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt +    91.73user 4.65system 3:26.61elapsed 46%CPU (0avgtext+0avgdata 385512maxresident)k +    0inputs+384976outputs (0major+580544minor)pagefaults 0swaps + +So, annecdotally, seems like autoaccept didn't do much here (not +vacuum-limited?), but the row inserts were more than a factor of 2x performance +improvement. Great! Could try experimenting with even larger batch sizes... + + +## Production Import + +http-verb branch 7354899493f6448bed5698ad6ade1dbebcf39379 + +    time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv + +    real    0m30.845s +    user    0m8.884s +    sys     0m0.312s + + +    time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - +    # TPS: 1181 +    real    6m54.019s => down from 22m! +    user    25m33.980s +    sys     1m30.480s + +    xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt +    # fatcatd at 200% CPU (2 full cores); many PIDs/workers, but only one so busy (must be diesel/db driver?) +    # parallel at ~80% +    # postgres pretty busy; looks like doing PARSE on ever request? some idle in transaction +    # postgres still does a vacuum to analyze; good! +    # ~600-1000 TPS near start (large variance) +    # left to run overnight... 
+    # slowed down to ~50-80 TPS about 10 hours later +    # lots of IOWAIT +    # only at 40309165 rows in the morning; this could take a long time + +    #   PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgres - Ref.: 2s +    #   Size:  153.71G -     1.90M/s        | TPS:          77 +    #   Mem.:   68.70% -    22.60G/49.14G   | IO Max:    20448/s +    #   Swap:    3.80% -     1.88G/50.00G   | Read :     10.17M/s -   2603/s +    #   Load:    11.71 11.80 12.01          | Write:      4.00M/s -   1024/s + +    92530.17user 2891.76system 35:45:15elapsed 74%CPU (0avgtext+0avgdata 463520maxresident)k +    1093736inputs+302588448outputs (52568major+36405225minor)pagefaults 0swaps +    # 35:45:15 elapsed + +    time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite + +## Perf Tweaks + + + +    SELECT +      relname, +      seq_scan - idx_scan AS too_much_seq, +      CASE +        WHEN +          seq_scan - coalesce(idx_scan, 0) > 0 +        THEN +          'Missing Index?' +        ELSE +          'OK' +      END, +      pg_relation_size(relname::regclass) AS rel_size, seq_scan, idx_scan +    FROM +      pg_stat_all_tables +    WHERE +      schemaname = 'public' +      AND pg_relation_size(relname::regclass) > 80000 +    ORDER BY +      too_much_seq DESC; + +       relname        | too_much_seq | case |   rel_size   | seq_scan | idx_scan   +----------------------+--------------+------+--------------+----------+----------- + file_rev_url         |           -1 | OK   |    163323904 |        2 |         3 + file_release         |           -3 | OK   |     24461312 |        2 |         5 + release_edit         |       -10265 | OK   |   6080495616 |        2 |     10267 + container_edit       |       -10265 | OK   |     31170560 |        2 |     10267 + work_edit            |       -10265 | OK   |   6080364544 |        2 |     10267 + file_edit            |       -10265 | OK   |     49111040 |        2 |     10267 + creator_edit         |       -10265 | OK   |    330924032 |        2 |     10267 + changelog            |       -11692 | OK   |    106668032 |        2 |     11694 + release_ref          |      -374197 | OK   | 125437673472 |        3 |    374200 + release_contrib      |      -374202 | OK   |  16835354624 |        3 |    374205 + release_rev_abstract |      -374919 | OK   |    162250752 |        3 |    374922 + file_ident           |     -1047172 | OK   |     41926656 |        2 |   1047174 + container_rev        |     -1378943 | OK   |     50356224 |   724612 |   2103555   <= + file_rev             |     -2407647 | OK   |     68493312 |        4 |   2407651 + abstracts            |     -2765610 | OK   |   1450901504 |        1 |   2765611 + creator_ident        |     -7127467 | OK   |    242688000 |        2 |   7127469 + creator_rev          |     -7943007 | OK   |    353370112 |   839155 |   8782162   <= + container_ident      |    -57488245 | OK   |     23060480 |        2 |  57488247 + release_ident        |    -66085389 | OK   |   4459159552 |       14 |  66085403 + work_rev             |   -130613391 | OK   |   2892775424 |        1 | 130613392 + work_ident           |   -130633923 | OK   |   4459192320 |        2 | 130633925 + editgroup            |   -136775970 | OK   |    120561664 |        1 | 136775971 + release_rev          |   -718337215 | OK   |  36850507776 |        8 | 718337223 + + +Slowest queries: + +    SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", 
"creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + +    SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = "container_rev"."id") WHERE "container_rev"."issnl" = $1 AND "container_rev"."issnl" IS NOT NULL AND "container_ident"."is_live" = $2 AND "container_ident"."redirect_id" IS NULL LIMIT $3 + +    SELECT $2 FROM ONLY "public"."release_rev" x WHERE "id" OPERATOR(pg_catalog.=) $1 FOR KEY SHARE OF x:...skipping... + +    SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + + +DEBUG: (while a file import is running) + +Creator lookup where row exists: + +    time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-1663 +    real    0m0.020s +    user    0m0.004s +    sys     0m0.004s + +    SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; +    => (1 row) +    Time: 0.988 ms +     +Creator lookup where row doesn't exist: + +    bnewbold@wbgrp-svc500$ time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X +    {"message":"Not found: 0000-0002-8867-166X"} +    real    0m1.282s +    user    0m0.008s +    sys     0m0.000s +    Sep 10 21:50:49 wbgrp-svc500.us.archive.org fatcat-api[25327]: Sep 10 21:50:49.231 INFO GET http://localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X 404 Not Found (1282 ms) + +    SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND 
"creator_ident"."redirect_id" IS NULL LIMIT 1; +    => (0 rows) +    Time: 0.810 ms + +------- + +    SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + +    SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; +    SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + +from logs: + +    execute __diesel_stmt_5: SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + +NOTE: have been doing *full* postgres logs this whole time! probably a ton of disk churn. 
:( + + +    exact logs: +    SET TIME ZONE 'UTC' +    SET CLIENT_ENCODING TO 'UTF8' +    SELECT 1 +    SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 +    parameters: $1 = '0000-0002-8867-166X', $2 = 't', $3 = '1' + +    SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT '1'; + +CHANGES: +- restart fatcat-api +- restart postgresql +- select pg_stat_reset(); + +seems like the fetch involves a index scans on creator_rev and creator_ident, *and* a scan on creator_rev. + + +                          table_name                          | table_size | indexes_size | total_size  +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref"                                       | 117 GB     | 34 GB        | 151 GB + "public"."release_rev"                                       | 34 GB      | 9202 MB      | 43 GB + "public"."release_contrib"                                   | 16 GB      | 17 GB        | 33 GB + "public"."work_edit"                                         | 5800 MB    | 4033 MB      | 9833 MB + "public"."release_edit"                                      | 5800 MB    | 4032 MB      | 9833 MB + "public"."work_ident"                                        | 4254 MB    | 5102 MB      | 9356 MB + "public"."release_ident"                                     | 4254 MB    | 5095 MB      | 9349 MB + "public"."work_rev"                                          | 2759 MB    | 2545 MB      | 5304 MB + "public"."abstracts"                                         | 1417 MB    | 114 MB       | 1530 MB + "public"."creator_rev"                                       | 337 MB     | 277 MB       | 614 MB + "public"."creator_edit"                                      | 316 MB     | 221 MB       | 536 MB + "public"."creator_ident"                                     | 232 MB     | 279 MB       | 511 MB + "public"."file_rev_url"                                      | 260 MB     | 101 MB       | 361 MB + "public"."release_rev_abstract"                              | 155 MB     | 200 MB       | 355 MB + "public"."file_rev"                                          | 109 MB     | 124 MB       | 233 MB + "public"."changelog"                                         | 102 MB     | 106 MB       | 208 MB + "public"."editgroup"                                         | 116 MB     | 69 MB        | 184 MB + "public"."file_ident"                                        | 66 MB      | 70 MB        | 136 MB + "public"."file_edit"                          
               | 78 MB      | 53 MB        | 131 MB + "public"."file_release"                                      | 38 MB      | 64 MB        | 103 MB + "public"."container_rev"                                     | 48 MB      | 26 MB        | 74 MB + "public"."container_edit"                                    | 30 MB      | 20 MB        | 50 MB + "public"."container_ident"                                   | 22 MB      | 26 MB        | 48 MB + +CHANGES: +- remove "IS NOT NULL" from creator_rev, that seemed to be a significant speedup for the "row not found" case. + diff --git a/notes/bootstrap/import_timing_20180910.txt b/notes/bootstrap/import_timing_20180910.txt new file mode 100644 index 00000000..43c76e43 --- /dev/null +++ b/notes/bootstrap/import_timing_20180910.txt @@ -0,0 +1,319 @@ + +Changes since last time: +- removed "WHERE IS NOT NULL" from lookup indexes +- some cleanups +- JSONB + +## QA Timing + +master branch b15fa552e288bec5bbc2b07a3e11bab9235a1e7c + +    time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv # --batch-size 100 +    real    0m30.249s +    user    0m9.364s +    sys     0m0.268s + +    time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - # --batch-size 100 +    => something went sort of wrong? seemed to hang, with no activity. +    => may have sent a signal or something accidentally? + +    xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt --batch-size 100 +    => ran for a few seconds before restarting with increased batch size +    => increased batch size seems to be too much for releases? stick with 50 +    => still lots of long autovacuum (ANALYZE) + +    67017.73user 2054.96system 25:59:38elapsed 73%CPU (0avgtext+0avgdata 424692maxresident)k +    2085872inputs+219678920outputs (100457major+30837042minor)pagefaults 0swaps +    # 25:59:38 elapsed + +    time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite # --batch-size 100 +    => taking longer to start than I remembered... + +    Done! 
Inserted 6607075 +    real    1521m37.856s +    user    305m38.444s +    sys     18m13.916s +    => 25.5 hours + +Disk/table usage: + +    Size:  233.95G + +                              table_name                          | table_size | indexes_size | total_size  +    --------------------------------------------------------------+------------+--------------+------------ +     "public"."release_ref"                                       | 92 GB      | 26 GB        | 118 GB +     "public"."release_rev"                                       | 30 GB      | 14 GB        | 45 GB +     "public"."release_contrib"                                   | 12 GB      | 13 GB        | 26 GB +     "public"."release_edit"                                      | 4710 MB    | 3353 MB      | 8062 MB +     "public"."work_edit"                                         | 4710 MB    | 3352 MB      | 8062 MB +     "public"."work_ident"                                        | 3454 MB    | 4153 MB      | 7608 MB +     "public"."release_ident"                                     | 3454 MB    | 4152 MB      | 7606 MB +     "public"."work_rev"                                          | 2241 MB    | 2076 MB      | 4317 MB +     "public"."file_rev_url"                                      | 1994 MB    | 778 MB       | 2772 MB +     "public"."file_rev"                                          | 827 MB     | 1215 MB      | 2042 MB +     "public"."file_ident"                                        | 501 MB     | 533 MB       | 1034 MB +     "public"."file_edit"                                         | 590 MB     | 411 MB       | 1001 MB +     "public"."creator_rev"                                       | 371 MB     | 455 MB       | 826 MB +     "public"."abstracts"                                         | 649 MB     | 52 MB        | 701 MB +     "public"."file_release"                                      | 251 MB     | 435 MB       | 686 MB +     "public"."creator_edit"                                      | 347 MB     | 242 MB       | 590 MB +     "public"."creator_ident"                                     | 255 MB     | 305 MB       | 560 MB +     "public"."release_rev_abstract"                              | 70 MB      | 90 MB        | 160 MB +     "public"."changelog"                                         | 52 MB      | 53 MB        | 106 MB +     "public"."editgroup"                                         | 60 MB      | 35 MB        | 94 MB +     "public"."container_rev"                                     | 40 MB      | 27 MB        | 67 MB +     "public"."container_edit"                                    | 24 MB      | 16 MB        | 39 MB +     "public"."container_ident"                                   | 17 MB      | 20 MB        | 37 MB + +I think this is before the JSONB schema change, so won't reflect actual/final. 
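Aside on the "removed WHERE IS NOT NULL from lookup indexes" change noted at the top of this file: the shape of it is roughly the below (index name and exact definition are illustrative, not copied from the real migration):

    -- before: partial lookup index; with bound parameters the planner couldn't
    -- prove the value was non-NULL, so generic plans sometimes fell back to seq scans
    -- CREATE INDEX creator_rev_orcid_idx ON creator_rev(orcid) WHERE orcid IS NOT NULL;

    -- after: plain index on the same column, usable for any equality lookup
    DROP INDEX IF EXISTS creator_rev_orcid_idx;
    CREATE INDEX creator_rev_orcid_idx ON creator_rev(orcid);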
+ +Perf: + +           relname        | too_much_seq | case |  rel_size   | seq_scan | idx_scan   +    ----------------------+--------------+------+-------------+----------+----------- +     file_rev_url         |          -18 | OK   |  2089836544 |        2 |        20 +     file_release         |        -4001 | OK   |   263249920 |        2 |      4003 +     container_edit       |      -132140 | OK   |    24608768 |        2 |    132142 +     release_edit         |      -132140 | OK   |  4937408512 |        2 |    132142 +     work_edit            |      -132140 | OK   |  4937351168 |        2 |    132142 +     file_edit            |      -132140 | OK   |   618545152 |        2 |    132142 +     creator_edit         |      -132140 | OK   |   363954176 |        2 |    132142 +     changelog            |      -134456 | OK   |    54919168 |        5 |    134461 +     container_rev        |     -1276670 | OK   |    41680896 |   592993 |   1869663 +     abstracts            |     -1353093 | OK   |   669425664 |        3 |   1353096 +     release_ref          |     -3429019 | OK   | 99223470080 |        3 |   3429022 +     release_contrib      |     -3429026 | OK   | 13406437376 |        3 |   3429029 +     release_rev_abstract |     -3435251 | OK   |    73359360 |        3 |   3435254 +     creator_ident        |     -5360858 | OK   |   266977280 |       11 |   5360869 +     creator_rev          |     -8071846 | OK   |   388734976 |   257781 |   8329627 +     file_ident           |    -13478348 | OK   |   524746752 |        7 |  13478355 +     file_rev             |    -30368193 | OK   |   867401728 |        4 |  30368197 +     container_ident      |    -47140767 | OK   |    18219008 |        9 |  47140776 +     release_ident        |    -61021070 | OK   |  3620864000 |       24 |  61021094 +     work_rev             |   -106056502 | OK   |  2349146112 |        1 | 106056503 +     work_ident           |   -106320780 | OK   |  3620954112 |        6 | 106320786 +     editgroup            |   -117742455 | OK   |    62398464 |        2 | 117742457 +     release_rev          |   -563415274 | OK   | 32455860224 |        8 | 563415282 + +Thought I had resoled those scans... hrm. Pretty sure (but not 100%) that I +cleared the counters. + +Dumps! + +    time psql fatcat_prod < ../extra/quick_dump.sql +    real    4m5.729s +    user    0m0.056s +    sys     0m0.016s +    => while other stuff running + +    time ./ident_table_snapshot.sh +    => seems very dominated by tar/xz compression time; should switch back to gzip + +    DATABASE_URL=$DATABASE_URL time ./ident_table_snapshot.sh /tmp +    614.80user 31.74system 13:23.51elapsed 80%CPU (0avgtext+0avgdata 10124maxresident)k +    8311456inputs+9761200outputs (2major+3792minor)pagefaults 0swaps +    => 13.5 minutes +    => fatcat_idents.2018-09-13.215147.r874951.tar.gz: 4.7GB +    => uncompressed, 8.3GB total + +    cat ../extra/sql_dumps/fatcat_ident_releases.tsv | time ./target/release/fatcat-export release -j8 -q | pv -l | gzip > fatcat_release_dump.json.gz +    => can not run with more than -j8, it seems (postgres client limit) +    => TPS: 12522 (!) +    => 101k 0:00:34 [2.99k/s] +    => estimate 6.5 hours + +    # NOTE AFTER THE FACT: had "contaner" typo in the below, so wasn't expanding containers +    cat ../extra/sql_dumps/fatcat_ident_releases.tsv | time ./target/release/fatcat-export release --expand files,contaner -j8 -q | pv -l | gzip > fatcat_release_dump_expanded.json.gz +    => TPS: 29605 (!) 
+    => 183k 0:00:35 [ 5.8k/s] (how is this faster than the above? other disk stuff finished?) +    => 341k 0:01:07 [3.48k/s] (ah, must have been in disk cache) +    => 4.12M 0:16:11 (averaging 4.2k/s; estimate 4 hours total) +    => fatcat_release_dump_expanded.json.gz: 3.4G with 6.76M lines, so expecting 35GB or so + +    ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- +    usr sys idl wai hiq siq| read  writ| recv  send|  in   out | int   csw  +     14   3  73   8   0   2|  43M   21M| 143k  326B|   0     0 |  64k  113k +     18   4  72   5   0   2|  55M 4096k| 246k  326B|   0     0 |  84k  157k +     17   4  72   6   0   2|  53M 9964k| 428k  908B|   0     0 |  84k  155k +     14   3  73   8   0   1|  46M   20M| 143k   87k|   0     0 |  71k  132k +     18   3  72   5   0   2|  52M 8880k| 318k   95k|   0     0 |  82k  149k + +    => fatcat-export using about 200% CPU in a single thread; other 8 threads at about 25% each +        => might be interesting to profile... too many copies? switch to tsv library? +    => seems like postgres only about 50% utilized + +Benchmarks! + +    # stub entity +    $ wrk -t8 -c100 -d30s http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai +    Running 30s test @ http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai +      8 threads and 100 connections +      Thread Stats   Avg      Stdev     Max   +/- Stdev +        Latency     7.44ms    5.26ms  67.53ms   60.07% +        Req/Sec     1.71k   113.55     2.07k    68.79% +      408986 requests in 30.01s, 117.01MB read +    Requests/sec:  13627.62 +    Transfer/sec:      3.90MB +    => TPS: 39958 +    => lots of "BEGIN" and "SELECT 1", implying new connections? hrm. +    => should run longer for more stable results + +    wrk -t12 -c400 -d2m --latency http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai + +    Running 2m test @ http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai +      12 threads and 400 connections +      Thread Stats   Avg      Stdev     Max   +/- Stdev +        Latency    25.75ms   17.02ms 263.74ms   65.11% +        Req/Sec     1.05k   377.38     1.82k    74.81% +      Latency Distribution +         50%   25.40ms +         75%   26.56ms +         90%   49.57ms +         99%   76.47ms +      1128253 requests in 2.00m, 322.80MB read +      Socket errors: connect 0, read 10, write 0, timeout 0 +    Requests/sec:   9396.03 +    Transfer/sec:      2.69MB + +    => single 470% fatcatd thread; diesel? 
+    => 50% systemd-journald + +    # rich entity +    $ wrk -t8 -c100 -d30s https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container +    Running 30s test @ https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container +      8 threads and 100 connections +      Thread Stats   Avg      Stdev     Max   +/- Stdev +        Latency    33.35ms   14.01ms 152.05ms   79.98% +        Req/Sec   363.14     30.36   464.00     76.48% +      86719 requests in 30.04s, 234.46MB read +    Requests/sec:   2886.55 +    Transfer/sec:      7.80MB + +    wrk -t12 -c400 -d2m --latency https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container + +    Running 2m test @ https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container +      12 threads and 400 connections +      Thread Stats   Avg      Stdev     Max   +/- Stdev +        Latency   149.19ms  125.30ms   1.93s    95.06% +        Req/Sec   237.89     34.60   650.00     73.54% +      Latency Distribution +         50%  129.86ms +         75%  134.14ms +         90%  210.42ms +         99%  904.26ms +      340625 requests in 2.00m, 0.90GB read +      Socket errors: connect 0, read 0, write 0, timeout 303 +    Requests/sec:   2836.17 +    Transfer/sec:      7.67MB + +    => TPS: 9090 +    ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- +     usr sys idl wai hiq siq| read  writ| recv  send|  in   out | int   csw  +      4   0  93   3   0   0|2401k   13M|   0     0 |2317B 4812B|4274  3824  +     22   6  68   0   0   3|4096B    0 |  57k  956B|4096B    0 | 135k  255k +     21   7  68   0   0   3|4096B    0 |  61k   66B|4096B    0 | 137k  259k +     21   7  68   0   0   3|  16k 8192B| 179k  636B|  16k    0 | 134k  252k +     21   7  68   0   0   3|   0     0 |  53k   66B|   0     0 | 135k  255k +     22   6  68   0   0   3|   0    17M|  51k  326B|   0     0 | 137k  261k +     22   6  68   0   0   3|   0     0 |  82k  903B|   0     0 | 136k  257k + +    # static (404) +    wrk -t12 -c400 -d2m --latency http://localhost:9411/v0/asdf +    => full 2600+% CPU in a single fatcat process (?) +    => lots of load in wrk also +    => system at a whole basically 100% CPU on all cores, though mostly kernel + +    ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- +    usr sys idl wai hiq siq| read  writ| recv  send|  in   out | int   csw  +      4   0  93   3   0   0|2399k   13M|   0     0 |2315B 4808B|4359  3976  +     26  66   2   0   0   6|   0     0 |4191k  956B|   0     0 | 465k  536k +     26  67   1   0   0   6|   0     0 | 511k   66B|   0     0 | 466k  543k +     27  64   3   0   0   6|   0     0 |2648k  318B|   0     0 | 481k  555k +     25  67   2   0   0   6|   0     0 |1909k  768B|   0     0 | 481k  550k + +    Running 2m test @ http://localhost:9411/v0/asdf +      12 threads and 400 connections +      Thread Stats   Avg      Stdev     Max   +/- Stdev +        Latency     1.40ms    1.65ms 219.48ms   93.33% +        Req/Sec    17.71k     8.97k   65.69k    63.84% +      Latency Distribution +         50%    1.13ms +         75%    1.89ms +         90%    2.71ms +         99%    4.76ms +      21145472 requests in 2.00m, 1.61GB read +      Socket errors: connect 0, read 12, write 0, timeout 0 +      Non-2xx or 3xx responses: 21145472 +    Requests/sec: 176071.79 +    Transfer/sec:     13.77MB + +Noting pretty non-trival traffic to 207.241.229.230 and +es-worker-02.us.archive.org (207.241.229.246)... 
probably because this is a +datanode? + +## Misc + +Postgres really wanted to log: <https://wiki.postgresql.org/wiki/Per-user_log_settings> + +    ALTER DATABASE fatcat_prod SET log_statement = 'ddl'; +    ALTER USER fatcat SET log_statement = 'ddl'; + + +## Later Imports + +    zcat /srv/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - +    => only ~40 TPS +    => looks like almost all selects... probably skipping? huge postgres CPU churn, huh. missing index? + +           relname        | too_much_seq |      case      |  rel_size   | seq_scan | idx_scan  +    ----------------------+--------------+----------------+-------------+----------+---------- +     file_rev             |     -5343173 | OK             |   867483648 |      693 |  5343866 + +    EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '7d97e98f8af710c7e7fe703abc8f639e0ee507c4' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1; + +Ugh, this whole cycle again. Query only takes 1ms for a no-hit, or 5ms for success. + +    http get http://localhost:9411/v0/file/lookup?sha1=d77cb2a8b207507e0df27ba1393e8118eec9217f => not found +    http get http://localhost:9411/v0/file/lookup?sha1=d6c3f6ca785ee63293da02615c8495c5cd25fa25 => found +        => but no releases? + +SET auto_explain.log_min_duration = 0; +SET auto_explain.log_analyze = true; +ALTER USER fatcat SET log_statement = 'all'; +ALTER USER fatcat SET auto_explain.log_min_duration = 0; +ALTER USER fatcat SET auto_explain.log_analyze = true;; +sudo service fatcat-api restart + + +2018-09-16 00:42:29.883 UTC [13868] fatcat@fatcat_prod LOG:  duration: 3028.143 ms  plan: +        Query Text: SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = $1 AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = $2 AND "file_ident"."redirect_id" IS NULL LIMIT $3 +        Limit  (cost=0.43..8.56 rows=1 width=454) (actual time=3028.139..3028.139 rows=0 loops=1) +          ->  Nested Loop  (cost=0.43..268756.48 rows=33056 width=454) (actual time=3028.138..3028.138 rows=0 loops=1) +                ->  Seq Scan on file_rev  (cost=0.00..205023.16 rows=33042 width=405) (actual time=3028.137..3028.137 rows=0 loops=1) +                      Filter: ((sha1 IS NOT NULL) AND ((sha1)::text = 'd68d4e2cc420647109cdfe410efed86cd64465e5'::text)) +                      Rows Removed by Filter: 6607786 +                ->  Index Scan using file_ident_rev_idx on file_ident  (cost=0.43..1.92 rows=1 width=49) (never executed) +                      Index Cond: (rev_id = file_rev.id) +                      Filter: (is_live AND (redirect_id IS NULL)) + + +fatcat_prod=# EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", 
"file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '3f242a192acc258bdfdb15194341943222222222222440c313' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1; +                                                        QUERY PLAN                                                         +-------------------------------------------------------------------------------------------------------------------------- + Limit  (cost=0.99..5.04 rows=1 width=454) +   ->  Nested Loop  (cost=0.99..5.04 rows=1 width=454) +         ->  Index Scan using file_rev_sha1_idx on file_rev  (cost=0.56..2.58 rows=1 width=405) +               Index Cond: ((sha1 IS NOT NULL) AND (sha1 = '3f242a192acc258bdfdb15194341943222222222222440c313'::bpchar)) +         ->  Index Scan using file_ident_rev_idx on file_ident  (cost=0.43..2.45 rows=1 width=49) +               Index Cond: (rev_id = file_rev.id) +               Filter: (is_live AND (redirect_id IS NULL)) + +After all this debugging, asked on IRC and learned that I really shouldn't ever +use CHAR() (or even VARCHAR()). Will update the schema. + +## Investigating Missing Rows + +    fatcat_prod=# select count(*) from release_ident; => 53028167 + +Thought I had more than that previously? Table sizes were also bigger. Hrm. + diff --git a/notes/bootstrap/import_timing_20180920.txt b/notes/bootstrap/import_timing_20180920.txt new file mode 100644 index 00000000..a57ffd77 --- /dev/null +++ b/notes/bootstrap/import_timing_20180920.txt @@ -0,0 +1,96 @@ + +This was on fatcat-prod-vm (2TB disk). + +    time ./fatcat_import.py import-issn /srv/fatcat/datasets/journal_extra_metadata.csv + +    Processed 53300 lines, inserted 53283, updated 0. +    real    0m32.463s +    user    0m8.716s +    sys     0m0.284s + +    time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - + +    Processed 48900 lines, inserted 48731, updated 0. <= these numbers times 80x +    100% 80:0=0s + +    real    10m20.598s +    user    26m16.544s +    sys     1m40.284s + +    time xzcat /srv/fatcat/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 + +    Processed 4679900 lines, inserted 3755867, updated 0. +    107730.08user 4110.22system 16:31:25elapsed 188%CPU (0avgtext+0avgdata 447496maxresident)k +    77644160inputs+361948352outputs (105major+49094767minor)pagefaults 0swaps + +    => 16.5 hours, faster! + +    select count(id) from release_ident; => 75106713 + +    kernel/system crashed after first file import (!), so don't have numbers from that. 
+ +Table sizes at this point: + +    select count(id) from file_ident; => 6334606 + +    Size:  389.25G + +                          table_name                          | table_size | indexes_size | total_size  +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref"                                       | 170 GB     | 47 GB        | 217 GB + "public"."release_rev"                                       | 44 GB      | 21 GB        | 65 GB + "public"."release_contrib"                                   | 19 GB      | 20 GB        | 39 GB + "public"."release_edit"                                      | 6671 MB    | 6505 MB      | 13 GB + "public"."work_edit"                                         | 6671 MB    | 6505 MB      | 13 GB + "public"."release_ident"                                     | 4892 MB    | 5875 MB      | 11 GB + "public"."work_ident"                                        | 4892 MB    | 5874 MB      | 11 GB + "public"."work_rev"                                          | 3174 MB    | 2936 MB      | 6109 MB + "public"."file_rev_url"                                      | 3634 MB    | 1456 MB      | 5090 MB + "public"."file_rev"                                          | 792 MB     | 1281 MB      | 2073 MB + "public"."abstracts"                                         | 1665 MB    | 135 MB       | 1800 MB + "public"."file_edit"                                         | 565 MB     | 561 MB       | 1126 MB + "public"."file_release"                                      | 380 MB     | 666 MB       | 1045 MB + "public"."file_ident"                                        | 415 MB     | 496 MB       | 911 MB + "public"."creator_rev"                                       | 371 MB     | 457 MB       | 828 MB + "public"."creator_edit"                                      | 347 MB     | 353 MB       | 700 MB + "public"."creator_ident"                                     | 255 MB     | 305 MB       | 559 MB + "public"."release_rev_abstract"                              | 183 MB     | 237 MB       | 421 MB + "public"."changelog"                                         | 122 MB     | 126 MB       | 247 MB + "public"."editgroup"                                         | 138 MB     | 81 MB        | 219 MB + "public"."container_rev"                                     | 52 MB      | 38 MB        | 89 MB + "public"."container_edit"                                    | 32 MB      | 30 MB        | 62 MB + "public"."container_ident"                                   | 24 MB      | 28 MB        | 52 MB + + +       relname        | too_much_seq | case |   rel_size   | seq_scan | idx_scan  +----------------------+--------------+------+--------------+----------+---------- + release_edit         |            0 | OK   |   6993084416 |        0 |        0 + container_rev        |            0 | OK   |     54124544 |        0 |        0 + creator_ident        |            0 | OK   |    266919936 |        0 |        0 + creator_edit         |            0 | OK   |    363921408 |        0 |        0 + work_rev             |            0 | OK   |   3327262720 |        0 |        0 + creator_rev          |            0 | OK   |    388726784 |        0 |        0 + work_ident           |            0 | OK   |   5128560640 |        0 |        0 + work_edit            |            0 | OK   |   6993108992 |        0 |        0 + container_ident      |            0 | OK   |     25092096 |        0 |        0 + file_edit            |          
+
+Continuing imports:
+
+    zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched -
+    => HTTP response body: {"message":"duplicate key value violates unique constraint \"file_edit_editgroup_id_ident_id_key\""}
+
diff --git a/notes/bootstrap/import_timing_20180923.txt b/notes/bootstrap/import_timing_20180923.txt
new file mode 100644
index 00000000..c7161842
--- /dev/null
+++ b/notes/bootstrap/import_timing_20180923.txt
@@ -0,0 +1,91 @@
+
+    105595.18user 3903.65system 15:59:39elapsed 190%CPU (0avgtext+0avgdata 458836maxresident)k
+    71022792inputs+327828472outputs (176major+31149593minor)pagefaults 0swaps
+
+    real    959m39.521s
+    user    1845m10.392s
+    sys     70m33.780s
+
+Did I get the same error again? I'm confused:
+
+    HTTP response body: {"message":"number of parameters must be between 0 and 65535\n"}
+    (but not in all threads)
+
+Yes, ugh: 50*2500 can be over 65535 (it's not just individual large releases;
+they come in big batches). A batch-capping sketch is further below.
+
+But:
+
+    select count(id) from release_ident; => 70006121
+
+A lot, though not 72 million like last time, hrm. I'm... going to move ahead I
+guess.
+
+"Processed 4440850 lines, inserted 3509600, updated 0."
+    => implies 79029915 records
+
+    time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched --no-file-update -
+    Processed 530750 lines, inserted 435239, updated 0. (etc)
+    Command exited with non-zero status 1
+    15121.47user 676.49system 2:23:52elapsed 183%CPU (0avgtext+0avgdata 70076maxresident)k
+    127760inputs+3477184outputs (116major+475489minor)pagefaults 0swaps
+
+    real    143m52.681s
+    user    252m31.620s
+    sys     11m21.608s
+
+    zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched -
+
+    Processed 485200 lines, inserted 244101, updated 168344. (etc)
+    22671.44user 1069.84system 3:27:47elapsed 190%CPU (0avgtext+0avgdata 39348maxresident)k
+    99672inputs+2497848outputs (109major+422150minor)pagefaults 0swaps
+
+fatcat-export dump:
+
+     INFO 2018-09-25T10:01:06Z: fatcat_export: Done reading (70006121 lines), waiting for workers to exit...
+      197GiB 4:56:17 [11.4MiB/s] [                <=>                ]
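+
+Regarding the "number of parameters must be between 0 and 65535" failures
+above: a minimal sketch of the obvious client-side fix, sizing batches by
+estimated bound-parameter count instead of a fixed 50 entities. This is a
+hypothetical illustration, not the actual importer code; `params_per_entity`
+is a made-up helper:
+
+    # hypothetical sketch (not fatcat code): keep any single bulk INSERT under
+    # PostgreSQL's 65535 bound-parameter limit by sizing batches dynamically
+    PG_MAX_PARAMS = 65535
+
+    def split_batch(entities, params_per_entity, max_params=PG_MAX_PARAMS):
+        """Yield sub-batches whose estimated parameter count stays under max_params."""
+        batch, count = [], 0
+        for entity in entities:
+            cost = params_per_entity(entity)  # e.g. rows-to-insert * columns-per-row
+            if batch and count + cost > max_params:
+                yield batch
+                batch, count = [], 0
+            batch.append(entity)
+            count += cost
+        if batch:
+            yield batch
+
+    # e.g. a release with ~2500 ref rows at, say, ~10 columns each would "cost"
+    # ~25000 parameters, so only a couple of such releases fit per batch
+
+A single entity that exceeds the limit on its own would still need its rows
+split across multiple statements.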
+
+How big is everything?
+
+    select count(*) from file_release; => 10,485,964
+    select count (distinct target_release_ident_id) from file_release; => 6,486,934
+    select count(id) from release_ident; => 70,006,121
+    select count(*) from container_ident; => 354,793
+    select count(*) from creator_ident; => 3,906,990
+    Size:  324.24G
+    /dev/vda1       1.8T  511G  1.2T  31% /
+
+                          table_name                          | table_size | indexes_size | total_size
+--------------------------------------------------------------+------------+--------------+------------
+ "public"."release_ref"                                       | 121 GB     | 42 GB        | 163 GB
+ "public"."release_rev"                                       | 33 GB      | 19 GB        | 52 GB
+ "public"."release_contrib"                                   | 21 GB      | 18 GB        | 39 GB
+ "public"."release_edit"                                      | 6218 MB    | 6084 MB      | 12 GB
+ "public"."work_edit"                                         | 6218 MB    | 6084 MB      | 12 GB
+ "public"."release_ident"                                     | 4560 MB    | 5470 MB      | 10030 MB
+ "public"."work_ident"                                        | 4560 MB    | 5466 MB      | 10027 MB
+ "public"."file_rev_url"                                      | 5543 MB    | 2112 MB      | 7655 MB
+ "public"."work_rev"                                          | 2958 MB    | 2733 MB      | 5691 MB
+ "public"."file_rev"                                          | 1201 MB    | 1811 MB      | 3012 MB
+ "public"."abstracts"                                         | 2294 MB    | 184 MB       | 2478 MB
+ "public"."file_edit"                                         | 931 MB     | 864 MB       | 1795 MB
+ "public"."file_release"                                      | 605 MB     | 1058 MB      | 1663 MB
+ "public"."file_ident"                                        | 529 MB     | 633 MB       | 1162 MB
+ "public"."creator_rev"                                       | 371 MB     | 456 MB       | 826 MB
+ "public"."creator_edit"                                      | 347 MB     | 352 MB       | 699 MB
+ "public"."release_rev_abstract"                              | 250 MB     | 325 MB       | 575 MB
+ "public"."creator_ident"                                     | 255 MB     | 304 MB       | 559 MB
+ "public"."changelog"                                         | 122 MB     | 127 MB       | 250 MB
+ "public"."editgroup"                                         | 138 MB     | 82 MB        | 220 MB
+ "public"."container_rev"                                     | 52 MB      | 38 MB        | 89 MB
+ "public"."container_edit"                                    | 32 MB      | 30 MB        | 62 MB
+ "public"."container_ident"                                   | 24 MB      | 28 MB        | 52 MB
+
+Hrm, bunch of not-accepted containers:
+
+    select count(*) from container_ident where is_live='f'; => 301507
+    select count(*) from release_ident where is_live='f'; => 0
+    select count(*) from work_ident where is_live='f'; => 0
+    select count(*) from creator_ident where is_live='f'; => 1 (there was a hang earlier)
+    select count(*) from file_ident where is_live='f'; => 0
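+
+A natural follow-up would be to see which editgroups those not-live container
+idents belong to; a sketch, assuming container_edit has (editgroup_id,
+ident_id) columns by analogy with the file_edit constraint seen earlier:
+
+    SELECT ce.editgroup_id, count(*) AS not_live_idents
+    FROM container_ident ci
+    JOIN container_edit ce ON ce.ident_id = ci.id
+    WHERE ci.is_live = 'f'
+    GROUP BY ce.editgroup_id
+    ORDER BY count(*) DESC
+    LIMIT 20;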
+
diff --git a/notes/bootstrap/initial_sources.txt b/notes/bootstrap/initial_sources.txt
new file mode 100644
index 00000000..cc22019d
--- /dev/null
+++ b/notes/bootstrap/initial_sources.txt
@@ -0,0 +1,26 @@
+
+Probably start with:
+
+    crossref (including citations)
+    arxiv
+    medline
+
+then merge in:
+
+    dblp
+    CORE
+    MSAG dump
+    VIAF
+    archive.org paper/url manifest
+    semantic scholar
+    oaDOI
+
+and later:
+
+    wikidata
+    opencitations
+    openlibrary
+
+national libraries:
+
+    http://www.dnb.de/EN/Service/DigitaleDienste/LinkedData/linkeddata_node.html
diff --git a/notes/bootstrap/old_imports.txt b/notes/bootstrap/old_imports.txt
new file mode 100644
index 00000000..1233d4a8
--- /dev/null
+++ b/notes/bootstrap/old_imports.txt
@@ -0,0 +1,20 @@
+
+## ORCID
+
+Directly from the compressed tarball; takes about 2 hours in production:
+
+    tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | grep '"person":' | time parallel -j12 --pipe --round-robin ./fatcat_import.py import-orcid -
+
+After tuning the database, `jq` CPU seems to be the bottleneck, so instead run
+from a pre-extracted file:
+
+    tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | rg '"person":' > /srv/datasets/public_profiles_1_2_json.all.json
+    time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid -
+
+Does not work:
+
+    ./fatcat_import.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json
+
+Instead:
+
+    cat /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json | jq -c . | ./fatcat_import.py import-orcid -
