From 186c03c2b9e1b08e4298383fc6593d1b2c3a5dd8 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 25 Sep 2018 15:44:39 -0700 Subject: start organizing notes into subdirectories --- notes/alignments.csv | 57 ------ notes/alignments.txt | 22 -- notes/bootstrap/import_timing_20180815.txt | 292 ++++++++++++++++++++++++++ notes/bootstrap/import_timing_20180908.txt | 249 ++++++++++++++++++++++ notes/bootstrap/import_timing_20180910.txt | 319 +++++++++++++++++++++++++++++ notes/bootstrap/import_timing_20180920.txt | 96 +++++++++ notes/bootstrap/import_timing_20180923.txt | 91 ++++++++ notes/bootstrap/initial_sources.txt | 26 +++ notes/bootstrap/old_imports.txt | 20 ++ notes/contrib_types.txt | 14 -- notes/import_timing_20180815.txt | 292 -------------------------- notes/import_timing_20180908.txt | 249 ---------------------- notes/import_timing_20180910.txt | 319 ----------------------------- notes/import_timing_20180920.txt | 96 --------- notes/import_timing_20180923.txt | 39 ---- notes/initial_sources.txt | 26 --- notes/old_imports.txt | 20 -- notes/performance/postgres_performance.txt | 203 ++++++++++++++++++ notes/performance/speed.txt | 82 ++++++++ notes/postgres_performance.txt | 203 ------------------ notes/schema/alignments.csv | 57 ++++++ notes/schema/alignments.txt | 22 ++ notes/schema/contrib_types.txt | 14 ++ notes/schema/work_release_types.txt | 28 +++ notes/speed.txt | 82 -------- notes/work_release_types.txt | 28 --- 26 files changed, 1499 insertions(+), 1447 deletions(-) delete mode 100644 notes/alignments.csv delete mode 100644 notes/alignments.txt create mode 100644 notes/bootstrap/import_timing_20180815.txt create mode 100644 notes/bootstrap/import_timing_20180908.txt create mode 100644 notes/bootstrap/import_timing_20180910.txt create mode 100644 notes/bootstrap/import_timing_20180920.txt create mode 100644 notes/bootstrap/import_timing_20180923.txt create mode 100644 notes/bootstrap/initial_sources.txt create mode 100644 notes/bootstrap/old_imports.txt delete mode 100644 notes/contrib_types.txt delete mode 100644 notes/import_timing_20180815.txt delete mode 100644 notes/import_timing_20180908.txt delete mode 100644 notes/import_timing_20180910.txt delete mode 100644 notes/import_timing_20180920.txt delete mode 100644 notes/import_timing_20180923.txt delete mode 100644 notes/initial_sources.txt delete mode 100644 notes/old_imports.txt create mode 100644 notes/performance/postgres_performance.txt create mode 100644 notes/performance/speed.txt delete mode 100644 notes/postgres_performance.txt create mode 100644 notes/schema/alignments.csv create mode 100644 notes/schema/alignments.txt create mode 100644 notes/schema/contrib_types.txt create mode 100644 notes/schema/work_release_types.txt delete mode 100644 notes/speed.txt delete mode 100644 notes/work_release_types.txt diff --git a/notes/alignments.csv b/notes/alignments.csv deleted file mode 100644 index b8619ddc..00000000 --- a/notes/alignments.csv +++ /dev/null @@ -1,57 +0,0 @@ -fatcat entity,fatcat field,description,Crossref,ORCID,Bibtex,CSL,BIBFRAME,Dublin Core (DCMES),Dublin Core (DCMI),oai_dc (OAI-PMH),resourceSync,Highwire Press (google scholar),MEDLINE (?) 
-WORK,,,,,,,Work,,,,,, -,work_type,,,,,,,Type,,,,, -RELEASE,,,,,,,Instance,,,,,, -,title,,title,,title,,,Title,,title,,citation_title, -,release_type,,type (???),,object type; howpublished,,,Type,,,,, -,release_status,"NEW; eg, preprint, final",,,,,,,,,,, -,container,,ISSN (via lookup),,ISSN (via lookup),,,Relation (?),,,,citation_issn (normalized), -,release_date,RENAME,published-print or published-online,,"year, month",issued,,Date,Issued (?),,,citation_publication_date, -,volume,,volume,,volume,,,,,,,citation_volume, -,pages,,page,,pages,,,,,,,"citation_firstpage, citation_lastpage", -,issue,,issue,,,,,,,,,citation_issue, -,doi,always lower-case,DOI,,doi,DOI,,Identifier,,,,citation_doi, -,isbn13,,ISBN (converted),,,ISBN,,Identifier,,,,citation_isbn, -,publisher,,publisher,,publisher,,,Publisher,,,,"citation_dissertation_institution, citation_technical_report_institution", -,language,NEW; RFC1766 (ISO 639-1 superset); may transition,,,,,,Language,,,,, -,ref:index,,reference:[index],,,Citation-number,,,,,,, -,ref:key,NEW,reference:key,,,citation-label,,,,,,, -,ref:raw,RENAME,reference:unstructured,,,,,,,,,, -,ref:container_title,NEW,reference:journal-title,,,,,,,,,, -,ref:year,NEW,reference:year,,,,,,,,,, -,ref:title,NEW,,,,,,,,,,, -,ref:locator,"NEW; (aka, page number)",,,,,,,,,,, -,contrib:role,,,,,,Role,,,,,, -,contrib:index,,author[index],,,,,,,,,, -,contrib:raw,RENAME,Author:{given+family},,author,,,,,,,, -,extra:number,tech reports,,,,,,,,,,citation_technical_report_number, -,extra:institution,for tech reports and dissertations,,,,,,,,,,, -,extra:abstract,,,,,,,Description,,,,citation_abstract, -,extra:pmcid,,,,,,,,,,,, -,extra:pmid,,,,,,,,,,,, -,extra:version,"eg, for software",,,,,,,,,,, -,,,,,,,,,,,,, -CREATOR,,,,,,,Agent,Creator / Contributor,,,,, -,display_name,NEW; usually western/latinized ,,.name.credit-name.value (?),,,,,,,,citation_author, -,orcid,,,.path,,,,,,,,, -,given_name,NEW,,.name.given-names.value,,,,,,,,, -,surname,NEW,,.name.family-names.value,,,,,,,,, -,extra:viaf,,,,,,,,,,,, -,extra:alt_names[],,,,,,,,,,,, -,,,,,,,,,,,,, -CONTAINER,,,,,,,N/A,relation.isPartOf,,,,, -,name,,,,,,,,,,,"citation_journal_title, citation_conference_title", -,publisher,,,,,,,,,,,, -,issnl,,,,,,,Identifier (?),,,,citation_issn (normalized), -,abbrev,,,,,,,,,,,citation_journal_abbrev, -,coden,,,,,,,,,,,, -,extra:issn-e,,,,,,,,,,,, -,extra:issn-p,,,,,,,,,,,, -FILE,,,,,,,Item,,,,,, -,release,,,,,,,,,,,, -,size,,,,,,,,,,length,, -,sha256,NEW,,,,,,,,,hash,, -,sha1,,,,,,,,,,hash,, -,md5,,,,,,,,,,hash,, -,url,,,,www_pdf_url,,,,,,href,citation_pdf_url, -,mimetype,NEW,,,,,,Format,,,type,, diff --git a/notes/alignments.txt b/notes/alignments.txt deleted file mode 100644 index e2736268..00000000 --- a/notes/alignments.txt +++ /dev/null @@ -1,22 +0,0 @@ -bibtex -BIBFRAME -schema.org: http://schema.org/CreativeWork -dublin core - -entity specific: -- crossref / release -- orcid / creator -- issn / container - - -## Dublin Core (original/simple) - -Usage guide: - -Quick descriptions of the "original 15" fields: - -## Citation Style Language - -Human-readable specification: - -Specifically, the "variables" and type definitions: diff --git a/notes/bootstrap/import_timing_20180815.txt b/notes/bootstrap/import_timing_20180815.txt new file mode 100644 index 00000000..1206cc41 --- /dev/null +++ b/notes/bootstrap/import_timing_20180815.txt @@ -0,0 +1,292 @@ + +Schema changes since previous imports: +- more fields (identifiers+indexes) +- timestamps +- UUIDs more places +- fixed some crossref import bugs? 
+- abstracts +- file_urls as table (not single value) +- timestamps +- TEXT -> CHAR in a few places +- removed many work fields + +## Containers + +(python)webcrawl@wbgrp-svc500:/srv/fatcat/src/python$ time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv + +real 1m25.292s +user 0m12.640s +sys 0m0.412s + +## Creators + +time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - + +(times very invalid due to hangs; got 3537837 creators, which is most of the way, so *shrug*) +real 22m2.465s +user 26m41.924s +sys 1m33.844s + +## Releases + +xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt + + 128516.30 user + 3905.14 system + 44:17:05 elapsed + 83% CPU + +Almost 44 hours... I think I remember more like 36 hours last time? Things +slowed down a lot towards the end, many more ORCID cross-references? + +looking in htop, postgres seems to be primary bottleneck. At something like 12 +hours in, had 44 million release_ident rows, which is 1000/second. + +Note: seems like the more frequently `count(*)` is run, the more performant. +Because in-memory? + + 2018-08-16 16:54:16.977 UTC [17996] postgres@fatcat_prod LOG: duration: 42949.549 ms statement: select count(id) from release_ident; + + fatcat_prod=# select count(*) from release_ident; + count + ---------- + 44185608 + (1 row) + + Time: 2753.916 ms (00:02.754) + fatcat_prod=# select count(*) from release_ident; + count + ---------- + 44187937 + (1 row) + + Time: 2711.670 ms (00:02.712) + +As expected, autovacuum very busy. Only ~150 TPS; but that includes batch +writes? 75061172 rows. + +## Files + + time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite + + Done! Inserted 6607075 + + real 2152m28.822s => 36 hours (!) + user 401m46.464s + sys 21m45.724s + + +Going pretty slow, < 100 transactions/sec. Lots of SELECTs, which seem slow, on the abstract table? + + SELECT "release_rev_abstract"."id", "release_rev_abstract"."release_rev", "release_rev_abstract"."abstract_sha1", "release_rev_abstract"."mimetype", "release_rev_abstract"."lang", "abstracts"."sha1", "abstracts"."content" FROM ("release_rev_abstract" INNER JOIN "abstracts" ON "release_rev_abstract"."abstract_sha1" = "abstracts"."sha1") WHERE "release_rev_abstract"."release_rev" = 'ffffffc0-4dd2-47ce-a51d-44051f3699ce'; + +Created index: + + CREATE INDEX release_rev_abstract_rev_idx ON release_rev_abstract(release_rev); + +... and things sped way up. Re-ran some crossref imports to EXPLAIN and didn't +see non-indexed queries. Maybe an ANALYZE does need to happen? + +This being single-threaded is going to be a problem in the future. ~50 million +files would be ~2 weeks. + +## Post-Import Status + + Size: 358.89G (postgres self-reported) + Mem.: 57.10% - 16.85G/49.14G + +Was 184G last time in late June; doubled in size (!). 
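+
+A quick cross-check of the self-reported size, straight from postgres (a minimal
+sketch; the per-table breakdown further below is the more detailed version):
+
+    -- whole-database size as postgres reports it
+    SELECT pg_size_pretty(pg_database_size('fatcat_prod'));
+
+    -- ten largest relations, counting indexes and TOAST
+    SELECT relname, pg_size_pretty(pg_total_relation_size(oid)) AS total_size
+    FROM pg_class
+    WHERE relkind = 'r'
+    ORDER BY pg_total_relation_size(oid) DESC
+    LIMIT 10;
+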
+ + bnewbold@wbgrp-svc500$ df -h / + Filesystem Size Used Avail Use% Mounted on + /dev/vda1 858G 529G 286G 65% / + + bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/ /srv/datasets/ /srv/elastic-blah/ + 361G /var/lib/postgresql/ + 83G /srv/datasets/ + 77G /srv/elastic-blah/ + + fatcat_prod=# select count(*) from changelog; => 2,085,067 + + SELECT + table_name, + pg_size_pretty(table_size) AS table_size, + pg_size_pretty(indexes_size) AS indexes_size, + pg_size_pretty(total_size) AS total_size + FROM ( + SELECT + table_name, + pg_table_size(table_name) AS table_size, + pg_indexes_size(table_name) AS indexes_size, + pg_total_relation_size(table_name) AS total_size + FROM ( + SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name + FROM information_schema.tables + ) AS all_tables + ORDER BY total_size DESC + ) AS pretty_sizes; + + table_name | table_size | indexes_size | total_size +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref" | 159 GB | 47 GB | 206 GB + "public"."release_rev" | 40 GB | 10 GB | 51 GB + "public"."release_contrib" | 19 GB | 20 GB | 39 GB + "public"."release_ident" | 5797 MB | 6597 MB | 12 GB + "public"."work_ident" | 5787 MB | 6394 MB | 12 GB + "public"."release_edit" | 6674 MB | 4646 MB | 11 GB + "public"."work_edit" | 6674 MB | 4646 MB | 11 GB + "public"."work_rev" | 3175 MB | 2939 MB | 6114 MB + "public"."file_rev_url" | 1995 MB | 275 MB | 2270 MB + "public"."abstracts" | 1665 MB | 135 MB | 1800 MB + "public"."file_rev" | 829 MB | 954 MB | 1783 MB + "public"."file_ident" | 498 MB | 532 MB | 1030 MB + "public"."file_release" | 369 MB | 642 MB | 1011 MB + "public"."file_edit" | 591 MB | 410 MB | 1002 MB + "public"."creator_rev" | 337 MB | 318 MB | 655 MB + "public"."creator_ident" | 280 MB | 297 MB | 577 MB + "public"."creator_edit" | 316 MB | 220 MB | 536 MB + "public"."release_rev_abstract" | 183 MB | 84 MB | 267 MB + "public"."changelog" | 123 MB | 125 MB | 249 MB + "public"."editgroup" | 139 MB | 81 MB | 220 MB + "public"."container_rev" | 19 MB | 6912 kB | 26 MB + "public"."container_ident" | 6896 kB | 7016 kB | 14 MB + "public"."container_edit" | 8056 kB | 5240 kB | 13 MB + +In context, the full uncompressed crossref 2018-01-21 dump is about 285 GB. + +For many of these indexes, and the _ident tables, switching from UUID to +BIGSERIAL would half the size. + +## Exports + + time ./fatcat_export.py changelog - | pv -l | wc + + As of: + + 159k 1:17:35 [34.3 /s] + 159,740 lines + 2,427,277,881 chars (bytes; 2.4GB) + + real 77m35.183s + user 15m36.208s + sys 0m31.484s + +Running at about 100/sec; estimate 6 hours for completion. Could shard using +start/end flags, but am not here. + +Running `quick_dump.sql` (identifier tables, in a transaction): + + 251M Aug 19 23:08 fatcat_ident_creators.tsv + 5.9M Aug 19 23:08 fatcat_ident_containers.tsv + 467M Aug 19 23:08 fatcat_ident_files.tsv + 5.2G Aug 19 23:10 fatcat_ident_releases.tsv + 5.2G Aug 19 23:11 fatcat_ident_works.tsv + 12K Aug 19 23:11 . + 1.8G Aug 19 23:12 fatcat_abstracts.json + +Work and Release tables in under 2 minutes each; say 5 minutes total. + + time ./fatcat_export.py releases /tmp/fatcat_ident_releases.tsv - | pv -l | wc + + 172k 1:07:08 [42.7 /s] + 172181 lines + 1,118,166,293 chars (bytes; 1.1 GB) + + real 67m8.340s + user 10m21.988s + sys 0m34.612s + +Running at only 10/sec or so, this would take forever even if sharded. :( + +Both exports/dumps are running in parallel. 
"Expand" queries might help with speed? + +## Postgres Analysis + +SELECT * +FROM + pg_stat_statements +ORDER BY + total_time DESC LIMIT 5; + +Summary: + + SELECT "creator_ident" by ORCID + 1,295,864 calls + 930,305,208 total time + 717.9 mean time <= this should be less than a ms! + + INSERT INTO release_rev + 75144055 calls + 111470961 total time + 1.483 mean time + + INSERT INTO work_rev + 75,144,055 calls + 82693994 total time + 1.1 mean time + + INSERT INTO release_contrib (creator_ident_id = DEFAULT) RETURNING * + 26,008,280 calls <= why so few? different query depending on number + of rows inserted + 18955782 total time + 0.728 mean time + + SELECT container_ident + 78,4143 calls + 17683156 total time + 22.55 mean time <= why so slow? + + INSERT INTO release_contrib + 15,072,820 calls + + INSERT INTO "release_contrib + + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan +----------------------+--------------+----------------+--------------+----------+----------- + file_rev_url | 2391 | Missing Index? | 2091147264 | 2391 | 0 + file_release | -30670 | OK | 386899968 | 2 | 30672 + container_rev | -979948 | OK | 20242432 | 784146 | 1764094 + file_edit | -2206807 | OK | 619896832 | 6 | 2206813 + creator_edit | -2206810 | OK | 331079680 | 11 | 2206821 + work_edit | -2206811 | OK | 6996566016 | 14 | 2206825 + release_edit | -2206811 | OK | 6996582400 | 14 | 2206825 + container_edit | -2206816 | OK | 8216576 | 5 | 2206821 + changelog | -2209659 | OK | 129286144 | 10 | 2209669 + abstracts | -3486466 | OK | 1706237952 | 8 | 3486474 + release_rev_abstract | -4975493 | OK | 191602688 | 42919 | 5018412 + release_ref | -5032717 | OK | 170494861312 | 3 | 5032720 + release_contrib | -5032744 | OK | 20370251776 | 3 | 5032747 + creator_rev | -8400410 | OK | 353583104 | 1296507 | 9696917 + file_ident | -13483224 | OK | 522190848 | 7 | 13483231 + creator_ident | -16686744 | OK | 293625856 | 3 | 16686747 + file_rev | -32405557 | OK | 868515840 | 4 | 32405561 + container_ident | -69162337 | OK | 7028736 | 3 | 69162340 + work_rev | -150288161 | OK | 3328589824 | 1 | 150288162 + editgroup | -162783807 | OK | 146112512 | 9 | 162783816 + release_ident | -165676917 | OK | 6076841984 | 52 | 165676969 + work_ident | -229439828 | OK | 6066814976 | 3 | 229439831 + release_rev | -930140217 | OK | 43360542720 | 9 | 930140226 + +TODO changes: +- don't return all as often; in particular, inserting release_contrib, release_ref +x missing an index somewhere on file_rev_url, release_rev_abstract +x why so many seq_scan on container_rev, creator_rev + => running/EXPLAIN same query on psql hits index, not seq_scan + => seemed to be an issue with VALUE params getting sent separately; query + planner only looked at query and wasn't using index on ORCID/ISSN-L because + it didn't know those values were not-NULL? + => adding NOT NULL to query seems to have sped up case of there being a + "hit", but no hit still slow. 
might need to change indices or something for + the (perhaps common in future) case of DOI lookups with invalid DOIs (eg, + CORE import) + +random DEBUG queries: + + EXPLAIN ANALYSE SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = true AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + + EXPLAIN VERBOSE SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_ident"."is_live" = true AND "creator_ident"."redirect_id" IS NULL VALUES ('0000-0002-8867-1669') LIMIT 1; + + EXPLAIN SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = "container_rev"."id") WHERE "container_rev"."issnl" = '0001-0782' AND "container_ident"."is_live" = true AND "container_ident"."redirect_id" IS NULL LIMIT 1; + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; diff --git a/notes/bootstrap/import_timing_20180908.txt b/notes/bootstrap/import_timing_20180908.txt new file mode 100644 index 00000000..3091e4fa --- /dev/null +++ b/notes/bootstrap/import_timing_20180908.txt @@ -0,0 +1,249 @@ + +Changes since last time: +- auto-accept flag (potentially no need to UPDATE/vacuum database) +- large CRUD code refactor (but not complete removal of api_server CRUD yet) +- removed "single query insert per entity", but added row-based inserts + +## Laptop Rough Timing + +HTTP-VERB branch 7354899493f6448bed5698ad6ade1dbebcf39379: +=> note: in this branch, importer is sitll creating editgroups in separate request + + time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv + real 0m28.779s + user 0m5.716s + sys 0m0.208s + + cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt + + 89.83user 4.58system 1:46.67elapsed 88%CPU (0avgtext+0avgdata 386184maxresident)k + 78632inputs+385408outputs (1major+579693minor)pagefaults 
0swaps + # 1:46.67 elapsed + + # running again + time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv + real 0m29.714s + user 0m6.048s + sys 0m0.212s + + cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt + 89.07user 4.52system 1:48.11elapsed 86%CPU (0avgtext+0avgdata 385940maxresident)k + 0inputs+377456outputs (0major+583532minor)pagefaults 0swaps + # 1:48.11 elapsed + +MASTER branch 0053d133f8ff96aa4dedc1ff7e2754812ddfc79a + + time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv + real 1m8.061s + user 0m7.384s + sys 0m0.280s + + cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt + 96.68user 5.25system 3:23.05elapsed 50%CPU (0avgtext+0avgdata 385516maxresident)k + 0inputs+389168outputs (0major+586810minor)pagefaults 0swaps + # 3:23.05 elapsed + +AUTO-ACCEPT branch 8cccbcdef11e7ddc761ec494cb894a8d49a0d510 + + time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv + real 1m7.757s + user 0m6.240s + sys 0m0.228s + + cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt + 91.73user 4.65system 3:26.61elapsed 46%CPU (0avgtext+0avgdata 385512maxresident)k + 0inputs+384976outputs (0major+580544minor)pagefaults 0swaps + +So, annecdotally, seems like autoaccept didn't do much here (not +vacuum-limited?), but the row inserts were more than a factor of 2x performance +improvement. Great! Could try experimenting with even larger batch sizes... + + +## Production Import + +http-verb branch 7354899493f6448bed5698ad6ade1dbebcf39379 + + time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv + + real 0m30.845s + user 0m8.884s + sys 0m0.312s + + + time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - + # TPS: 1181 + real 6m54.019s => down from 22m! + user 25m33.980s + sys 1m30.480s + + xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt + # fatcatd at 200% CPU (2 full cores); many PIDs/workers, but only one so busy (must be diesel/db driver?) + # parallel at ~80% + # postgres pretty busy; looks like doing PARSE on ever request? some idle in transaction + # postgres still does a vacuum to analyze; good! + # ~600-1000 TPS near start (large variance) + # left to run overnight... 
+ # slowed down to ~50-80 TPS about 10 hours later + # lots of IOWAIT + # only at 40309165 rows in the morning; this could take a long time + + # PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgres - Ref.: 2s + # Size: 153.71G - 1.90M/s | TPS: 77 + # Mem.: 68.70% - 22.60G/49.14G | IO Max: 20448/s + # Swap: 3.80% - 1.88G/50.00G | Read : 10.17M/s - 2603/s + # Load: 11.71 11.80 12.01 | Write: 4.00M/s - 1024/s + + 92530.17user 2891.76system 35:45:15elapsed 74%CPU (0avgtext+0avgdata 463520maxresident)k + 1093736inputs+302588448outputs (52568major+36405225minor)pagefaults 0swaps + # 35:45:15 elapsed + + time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite + +## Perf Tweaks + + + + SELECT + relname, + seq_scan - idx_scan AS too_much_seq, + CASE + WHEN + seq_scan - coalesce(idx_scan, 0) > 0 + THEN + 'Missing Index?' + ELSE + 'OK' + END, + pg_relation_size(relname::regclass) AS rel_size, seq_scan, idx_scan + FROM + pg_stat_all_tables + WHERE + schemaname = 'public' + AND pg_relation_size(relname::regclass) > 80000 + ORDER BY + too_much_seq DESC; + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan +----------------------+--------------+------+--------------+----------+----------- + file_rev_url | -1 | OK | 163323904 | 2 | 3 + file_release | -3 | OK | 24461312 | 2 | 5 + release_edit | -10265 | OK | 6080495616 | 2 | 10267 + container_edit | -10265 | OK | 31170560 | 2 | 10267 + work_edit | -10265 | OK | 6080364544 | 2 | 10267 + file_edit | -10265 | OK | 49111040 | 2 | 10267 + creator_edit | -10265 | OK | 330924032 | 2 | 10267 + changelog | -11692 | OK | 106668032 | 2 | 11694 + release_ref | -374197 | OK | 125437673472 | 3 | 374200 + release_contrib | -374202 | OK | 16835354624 | 3 | 374205 + release_rev_abstract | -374919 | OK | 162250752 | 3 | 374922 + file_ident | -1047172 | OK | 41926656 | 2 | 1047174 + container_rev | -1378943 | OK | 50356224 | 724612 | 2103555 <= + file_rev | -2407647 | OK | 68493312 | 4 | 2407651 + abstracts | -2765610 | OK | 1450901504 | 1 | 2765611 + creator_ident | -7127467 | OK | 242688000 | 2 | 7127469 + creator_rev | -7943007 | OK | 353370112 | 839155 | 8782162 <= + container_ident | -57488245 | OK | 23060480 | 2 | 57488247 + release_ident | -66085389 | OK | 4459159552 | 14 | 66085403 + work_rev | -130613391 | OK | 2892775424 | 1 | 130613392 + work_ident | -130633923 | OK | 4459192320 | 2 | 130633925 + editgroup | -136775970 | OK | 120561664 | 1 | 136775971 + release_rev | -718337215 | OK | 36850507776 | 8 | 718337223 + + +Slowest queries: + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + + SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = 
"container_rev"."id") WHERE "container_rev"."issnl" = $1 AND "container_rev"."issnl" IS NOT NULL AND "container_ident"."is_live" = $2 AND "container_ident"."redirect_id" IS NULL LIMIT $3 + + SELECT $2 FROM ONLY "public"."release_rev" x WHERE "id" OPERATOR(pg_catalog.=) $1 FOR KEY SHARE OF x:...skipping... + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + + +DEBUG: (while a file import is running) + +Creator lookup where row exists: + + time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-1663 + real 0m0.020s + user 0m0.004s + sys 0m0.004s + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + => (1 row) + Time: 0.988 ms + +Creator lookup where row doesn't exist: + + bnewbold@wbgrp-svc500$ time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X + {"message":"Not found: 0000-0002-8867-166X"} + real 0m1.282s + user 0m0.008s + sys 0m0.000s + Sep 10 21:50:49 wbgrp-svc500.us.archive.org fatcat-api[25327]: Sep 10 21:50:49.231 INFO GET http://localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X 404 Not Found (1282 ms) + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + => (0 rows) + Time: 0.810 ms + +------- + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM 
("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; + +from logs: + + execute __diesel_stmt_5: SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + +NOTE: have been doing *full* postgres logs this whole time! probably a ton of disk churn. :( + + + exact logs: + SET TIME ZONE 'UTC' + SET CLIENT_ENCODING TO 'UTF8' + SELECT 1 + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 + parameters: $1 = '0000-0002-8867-166X', $2 = 't', $3 = '1' + + SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT '1'; + +CHANGES: +- restart fatcat-api +- restart postgresql +- select pg_stat_reset(); + +seems like the fetch involves a index scans on creator_rev and creator_ident, *and* a scan on creator_rev. 
+ + + table_name | table_size | indexes_size | total_size +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref" | 117 GB | 34 GB | 151 GB + "public"."release_rev" | 34 GB | 9202 MB | 43 GB + "public"."release_contrib" | 16 GB | 17 GB | 33 GB + "public"."work_edit" | 5800 MB | 4033 MB | 9833 MB + "public"."release_edit" | 5800 MB | 4032 MB | 9833 MB + "public"."work_ident" | 4254 MB | 5102 MB | 9356 MB + "public"."release_ident" | 4254 MB | 5095 MB | 9349 MB + "public"."work_rev" | 2759 MB | 2545 MB | 5304 MB + "public"."abstracts" | 1417 MB | 114 MB | 1530 MB + "public"."creator_rev" | 337 MB | 277 MB | 614 MB + "public"."creator_edit" | 316 MB | 221 MB | 536 MB + "public"."creator_ident" | 232 MB | 279 MB | 511 MB + "public"."file_rev_url" | 260 MB | 101 MB | 361 MB + "public"."release_rev_abstract" | 155 MB | 200 MB | 355 MB + "public"."file_rev" | 109 MB | 124 MB | 233 MB + "public"."changelog" | 102 MB | 106 MB | 208 MB + "public"."editgroup" | 116 MB | 69 MB | 184 MB + "public"."file_ident" | 66 MB | 70 MB | 136 MB + "public"."file_edit" | 78 MB | 53 MB | 131 MB + "public"."file_release" | 38 MB | 64 MB | 103 MB + "public"."container_rev" | 48 MB | 26 MB | 74 MB + "public"."container_edit" | 30 MB | 20 MB | 50 MB + "public"."container_ident" | 22 MB | 26 MB | 48 MB + +CHANGES: +- remove "IS NOT NULL" from creator_rev, that seemed to be a significant speedup for the "row not found" case. + diff --git a/notes/bootstrap/import_timing_20180910.txt b/notes/bootstrap/import_timing_20180910.txt new file mode 100644 index 00000000..43c76e43 --- /dev/null +++ b/notes/bootstrap/import_timing_20180910.txt @@ -0,0 +1,319 @@ + +Changes since last time: +- removed "WHERE IS NOT NULL" from lookup indexes +- some cleanups +- JSONB + +## QA Timing + +master branch b15fa552e288bec5bbc2b07a3e11bab9235a1e7c + + time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv # --batch-size 100 + real 0m30.249s + user 0m9.364s + sys 0m0.268s + + time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - # --batch-size 100 + => something went sort of wrong? seemed to hang, with no activity. + => may have sent a signal or something accidentally? + + xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt --batch-size 100 + => ran for a few seconds before restarting with increased batch size + => increased batch size seems to be too much for releases? stick with 50 + => still lots of long autovacuum (ANALYZE) + + 67017.73user 2054.96system 25:59:38elapsed 73%CPU (0avgtext+0avgdata 424692maxresident)k + 2085872inputs+219678920outputs (100457major+30837042minor)pagefaults 0swaps + # 25:59:38 elapsed + + time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite # --batch-size 100 + => taking longer to start than I remembered... + + Done! 
Inserted 6607075 + real 1521m37.856s + user 305m38.444s + sys 18m13.916s + => 25.5 hours + +Disk/table usage: + + Size: 233.95G + + table_name | table_size | indexes_size | total_size + --------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref" | 92 GB | 26 GB | 118 GB + "public"."release_rev" | 30 GB | 14 GB | 45 GB + "public"."release_contrib" | 12 GB | 13 GB | 26 GB + "public"."release_edit" | 4710 MB | 3353 MB | 8062 MB + "public"."work_edit" | 4710 MB | 3352 MB | 8062 MB + "public"."work_ident" | 3454 MB | 4153 MB | 7608 MB + "public"."release_ident" | 3454 MB | 4152 MB | 7606 MB + "public"."work_rev" | 2241 MB | 2076 MB | 4317 MB + "public"."file_rev_url" | 1994 MB | 778 MB | 2772 MB + "public"."file_rev" | 827 MB | 1215 MB | 2042 MB + "public"."file_ident" | 501 MB | 533 MB | 1034 MB + "public"."file_edit" | 590 MB | 411 MB | 1001 MB + "public"."creator_rev" | 371 MB | 455 MB | 826 MB + "public"."abstracts" | 649 MB | 52 MB | 701 MB + "public"."file_release" | 251 MB | 435 MB | 686 MB + "public"."creator_edit" | 347 MB | 242 MB | 590 MB + "public"."creator_ident" | 255 MB | 305 MB | 560 MB + "public"."release_rev_abstract" | 70 MB | 90 MB | 160 MB + "public"."changelog" | 52 MB | 53 MB | 106 MB + "public"."editgroup" | 60 MB | 35 MB | 94 MB + "public"."container_rev" | 40 MB | 27 MB | 67 MB + "public"."container_edit" | 24 MB | 16 MB | 39 MB + "public"."container_ident" | 17 MB | 20 MB | 37 MB + +I think this is before the JSONB schema change, so won't reflect actual/final. + +Perf: + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan + ----------------------+--------------+------+-------------+----------+----------- + file_rev_url | -18 | OK | 2089836544 | 2 | 20 + file_release | -4001 | OK | 263249920 | 2 | 4003 + container_edit | -132140 | OK | 24608768 | 2 | 132142 + release_edit | -132140 | OK | 4937408512 | 2 | 132142 + work_edit | -132140 | OK | 4937351168 | 2 | 132142 + file_edit | -132140 | OK | 618545152 | 2 | 132142 + creator_edit | -132140 | OK | 363954176 | 2 | 132142 + changelog | -134456 | OK | 54919168 | 5 | 134461 + container_rev | -1276670 | OK | 41680896 | 592993 | 1869663 + abstracts | -1353093 | OK | 669425664 | 3 | 1353096 + release_ref | -3429019 | OK | 99223470080 | 3 | 3429022 + release_contrib | -3429026 | OK | 13406437376 | 3 | 3429029 + release_rev_abstract | -3435251 | OK | 73359360 | 3 | 3435254 + creator_ident | -5360858 | OK | 266977280 | 11 | 5360869 + creator_rev | -8071846 | OK | 388734976 | 257781 | 8329627 + file_ident | -13478348 | OK | 524746752 | 7 | 13478355 + file_rev | -30368193 | OK | 867401728 | 4 | 30368197 + container_ident | -47140767 | OK | 18219008 | 9 | 47140776 + release_ident | -61021070 | OK | 3620864000 | 24 | 61021094 + work_rev | -106056502 | OK | 2349146112 | 1 | 106056503 + work_ident | -106320780 | OK | 3620954112 | 6 | 106320786 + editgroup | -117742455 | OK | 62398464 | 2 | 117742457 + release_rev | -563415274 | OK | 32455860224 | 8 | 563415282 + +Thought I had resoled those scans... hrm. Pretty sure (but not 100%) that I +cleared the counters. + +Dumps! 
+ + time psql fatcat_prod < ../extra/quick_dump.sql + real 4m5.729s + user 0m0.056s + sys 0m0.016s + => while other stuff running + + time ./ident_table_snapshot.sh + => seems very dominated by tar/xz compression time; should switch back to gzip + + DATABASE_URL=$DATABASE_URL time ./ident_table_snapshot.sh /tmp + 614.80user 31.74system 13:23.51elapsed 80%CPU (0avgtext+0avgdata 10124maxresident)k + 8311456inputs+9761200outputs (2major+3792minor)pagefaults 0swaps + => 13.5 minutes + => fatcat_idents.2018-09-13.215147.r874951.tar.gz: 4.7GB + => uncompressed, 8.3GB total + + cat ../extra/sql_dumps/fatcat_ident_releases.tsv | time ./target/release/fatcat-export release -j8 -q | pv -l | gzip > fatcat_release_dump.json.gz + => can not run with more than -j8, it seems (postgres client limit) + => TPS: 12522 (!) + => 101k 0:00:34 [2.99k/s] + => estimate 6.5 hours + + # NOTE AFTER THE FACT: had "contaner" typo in the below, so wasn't expanding containers + cat ../extra/sql_dumps/fatcat_ident_releases.tsv | time ./target/release/fatcat-export release --expand files,contaner -j8 -q | pv -l | gzip > fatcat_release_dump_expanded.json.gz + => TPS: 29605 (!) + => 183k 0:00:35 [ 5.8k/s] (how is this faster than the above? other disk stuff finished?) + => 341k 0:01:07 [3.48k/s] (ah, must have been in disk cache) + => 4.12M 0:16:11 (averaging 4.2k/s; estimate 4 hours total) + => fatcat_release_dump_expanded.json.gz: 3.4G with 6.76M lines, so expecting 35GB or so + + ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- + usr sys idl wai hiq siq| read writ| recv send| in out | int csw + 14 3 73 8 0 2| 43M 21M| 143k 326B| 0 0 | 64k 113k + 18 4 72 5 0 2| 55M 4096k| 246k 326B| 0 0 | 84k 157k + 17 4 72 6 0 2| 53M 9964k| 428k 908B| 0 0 | 84k 155k + 14 3 73 8 0 1| 46M 20M| 143k 87k| 0 0 | 71k 132k + 18 3 72 5 0 2| 52M 8880k| 318k 95k| 0 0 | 82k 149k + + => fatcat-export using about 200% CPU in a single thread; other 8 threads at about 25% each + => might be interesting to profile... too many copies? switch to tsv library? + => seems like postgres only about 50% utilized + +Benchmarks! + + # stub entity + $ wrk -t8 -c100 -d30s http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai + Running 30s test @ http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai + 8 threads and 100 connections + Thread Stats Avg Stdev Max +/- Stdev + Latency 7.44ms 5.26ms 67.53ms 60.07% + Req/Sec 1.71k 113.55 2.07k 68.79% + 408986 requests in 30.01s, 117.01MB read + Requests/sec: 13627.62 + Transfer/sec: 3.90MB + => TPS: 39958 + => lots of "BEGIN" and "SELECT 1", implying new connections? hrm. + => should run longer for more stable results + + wrk -t12 -c400 -d2m --latency http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai + + Running 2m test @ http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai + 12 threads and 400 connections + Thread Stats Avg Stdev Max +/- Stdev + Latency 25.75ms 17.02ms 263.74ms 65.11% + Req/Sec 1.05k 377.38 1.82k 74.81% + Latency Distribution + 50% 25.40ms + 75% 26.56ms + 90% 49.57ms + 99% 76.47ms + 1128253 requests in 2.00m, 322.80MB read + Socket errors: connect 0, read 10, write 0, timeout 0 + Requests/sec: 9396.03 + Transfer/sec: 2.69MB + + => single 470% fatcatd thread; diesel? 
+ => 50% systemd-journald + + # rich entity + $ wrk -t8 -c100 -d30s https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container + Running 30s test @ https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container + 8 threads and 100 connections + Thread Stats Avg Stdev Max +/- Stdev + Latency 33.35ms 14.01ms 152.05ms 79.98% + Req/Sec 363.14 30.36 464.00 76.48% + 86719 requests in 30.04s, 234.46MB read + Requests/sec: 2886.55 + Transfer/sec: 7.80MB + + wrk -t12 -c400 -d2m --latency https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container + + Running 2m test @ https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container + 12 threads and 400 connections + Thread Stats Avg Stdev Max +/- Stdev + Latency 149.19ms 125.30ms 1.93s 95.06% + Req/Sec 237.89 34.60 650.00 73.54% + Latency Distribution + 50% 129.86ms + 75% 134.14ms + 90% 210.42ms + 99% 904.26ms + 340625 requests in 2.00m, 0.90GB read + Socket errors: connect 0, read 0, write 0, timeout 303 + Requests/sec: 2836.17 + Transfer/sec: 7.67MB + + => TPS: 9090 + ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- + usr sys idl wai hiq siq| read writ| recv send| in out | int csw + 4 0 93 3 0 0|2401k 13M| 0 0 |2317B 4812B|4274 3824 + 22 6 68 0 0 3|4096B 0 | 57k 956B|4096B 0 | 135k 255k + 21 7 68 0 0 3|4096B 0 | 61k 66B|4096B 0 | 137k 259k + 21 7 68 0 0 3| 16k 8192B| 179k 636B| 16k 0 | 134k 252k + 21 7 68 0 0 3| 0 0 | 53k 66B| 0 0 | 135k 255k + 22 6 68 0 0 3| 0 17M| 51k 326B| 0 0 | 137k 261k + 22 6 68 0 0 3| 0 0 | 82k 903B| 0 0 | 136k 257k + + # static (404) + wrk -t12 -c400 -d2m --latency http://localhost:9411/v0/asdf + => full 2600+% CPU in a single fatcat process (?) + => lots of load in wrk also + => system at a whole basically 100% CPU on all cores, though mostly kernel + + ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- + usr sys idl wai hiq siq| read writ| recv send| in out | int csw + 4 0 93 3 0 0|2399k 13M| 0 0 |2315B 4808B|4359 3976 + 26 66 2 0 0 6| 0 0 |4191k 956B| 0 0 | 465k 536k + 26 67 1 0 0 6| 0 0 | 511k 66B| 0 0 | 466k 543k + 27 64 3 0 0 6| 0 0 |2648k 318B| 0 0 | 481k 555k + 25 67 2 0 0 6| 0 0 |1909k 768B| 0 0 | 481k 550k + + Running 2m test @ http://localhost:9411/v0/asdf + 12 threads and 400 connections + Thread Stats Avg Stdev Max +/- Stdev + Latency 1.40ms 1.65ms 219.48ms 93.33% + Req/Sec 17.71k 8.97k 65.69k 63.84% + Latency Distribution + 50% 1.13ms + 75% 1.89ms + 90% 2.71ms + 99% 4.76ms + 21145472 requests in 2.00m, 1.61GB read + Socket errors: connect 0, read 12, write 0, timeout 0 + Non-2xx or 3xx responses: 21145472 + Requests/sec: 176071.79 + Transfer/sec: 13.77MB + +Noting pretty non-trival traffic to 207.241.229.230 and +es-worker-02.us.archive.org (207.241.229.246)... probably because this is a +datanode? + +## Misc + +Postgres really wanted to log: + + ALTER DATABASE fatcat_prod SET log_statement = 'ddl'; + ALTER USER fatcat SET log_statement = 'ddl'; + + +## Later Imports + + zcat /srv/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - + => only ~40 TPS + => looks like almost all selects... probably skipping? huge postgres CPU churn, huh. missing index? 
+ + relname | too_much_seq | case | rel_size | seq_scan | idx_scan + ----------------------+--------------+----------------+-------------+----------+---------- + file_rev | -5343173 | OK | 867483648 | 693 | 5343866 + + EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '7d97e98f8af710c7e7fe703abc8f639e0ee507c4' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1; + +Ugh, this whole cycle again. Query only takes 1ms for a no-hit, or 5ms for success. + + http get http://localhost:9411/v0/file/lookup?sha1=d77cb2a8b207507e0df27ba1393e8118eec9217f => not found + http get http://localhost:9411/v0/file/lookup?sha1=d6c3f6ca785ee63293da02615c8495c5cd25fa25 => found + => but no releases? + +SET auto_explain.log_min_duration = 0; +SET auto_explain.log_analyze = true; +ALTER USER fatcat SET log_statement = 'all'; +ALTER USER fatcat SET auto_explain.log_min_duration = 0; +ALTER USER fatcat SET auto_explain.log_analyze = true;; +sudo service fatcat-api restart + + +2018-09-16 00:42:29.883 UTC [13868] fatcat@fatcat_prod LOG: duration: 3028.143 ms plan: + Query Text: SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = $1 AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = $2 AND "file_ident"."redirect_id" IS NULL LIMIT $3 + Limit (cost=0.43..8.56 rows=1 width=454) (actual time=3028.139..3028.139 rows=0 loops=1) + -> Nested Loop (cost=0.43..268756.48 rows=33056 width=454) (actual time=3028.138..3028.138 rows=0 loops=1) + -> Seq Scan on file_rev (cost=0.00..205023.16 rows=33042 width=405) (actual time=3028.137..3028.137 rows=0 loops=1) + Filter: ((sha1 IS NOT NULL) AND ((sha1)::text = 'd68d4e2cc420647109cdfe410efed86cd64465e5'::text)) + Rows Removed by Filter: 6607786 + -> Index Scan using file_ident_rev_idx on file_ident (cost=0.43..1.92 rows=1 width=49) (never executed) + Index Cond: (rev_id = file_rev.id) + Filter: (is_live AND (redirect_id IS NULL)) + + +fatcat_prod=# EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '3f242a192acc258bdfdb15194341943222222222222440c313' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------- + Limit (cost=0.99..5.04 rows=1 width=454) + -> Nested Loop (cost=0.99..5.04 rows=1 width=454) + -> Index Scan using file_rev_sha1_idx on file_rev (cost=0.56..2.58 rows=1 width=405) + Index Cond: ((sha1 IS NOT NULL) AND (sha1 = '3f242a192acc258bdfdb15194341943222222222222440c313'::bpchar)) + -> Index Scan using 
file_ident_rev_idx on file_ident (cost=0.43..2.45 rows=1 width=49) + Index Cond: (rev_id = file_rev.id) + Filter: (is_live AND (redirect_id IS NULL)) + +After all this debugging, asked on IRC and learned that I really shouldn't ever +use CHAR() (or even VARCHAR()). Will update the schema. + +## Investigating Missing Rows + + fatcat_prod=# select count(*) from release_ident; => 53028167 + +Thought I had more than that previously? Table sizes were also bigger. Hrm. + diff --git a/notes/bootstrap/import_timing_20180920.txt b/notes/bootstrap/import_timing_20180920.txt new file mode 100644 index 00000000..a57ffd77 --- /dev/null +++ b/notes/bootstrap/import_timing_20180920.txt @@ -0,0 +1,96 @@ + +This was on fatcat-prod-vm (2TB disk). + + time ./fatcat_import.py import-issn /srv/fatcat/datasets/journal_extra_metadata.csv + + Processed 53300 lines, inserted 53283, updated 0. + real 0m32.463s + user 0m8.716s + sys 0m0.284s + + time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - + + Processed 48900 lines, inserted 48731, updated 0. <= these numbers times 80x + 100% 80:0=0s + + real 10m20.598s + user 26m16.544s + sys 1m40.284s + + time xzcat /srv/fatcat/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 + + Processed 4679900 lines, inserted 3755867, updated 0. + 107730.08user 4110.22system 16:31:25elapsed 188%CPU (0avgtext+0avgdata 447496maxresident)k + 77644160inputs+361948352outputs (105major+49094767minor)pagefaults 0swaps + + => 16.5 hours, faster! + + select count(id) from release_ident; => 75106713 + + kernel/system crashed after first file import (!), so don't have numbers from that. 
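+
+One way to gauge what the interrupted file import left behind (a sketch; per the
+later notes, edits that were never accepted show up as is_live = false on the
+ident rows):
+
+    SELECT count(*) FROM file_ident;
+    SELECT count(*) FROM file_ident WHERE is_live = 'f';
+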
+ +Table sizes at this point: + + select count(id) from file_ident; => 6334606 + + Size: 389.25G + + table_name | table_size | indexes_size | total_size +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref" | 170 GB | 47 GB | 217 GB + "public"."release_rev" | 44 GB | 21 GB | 65 GB + "public"."release_contrib" | 19 GB | 20 GB | 39 GB + "public"."release_edit" | 6671 MB | 6505 MB | 13 GB + "public"."work_edit" | 6671 MB | 6505 MB | 13 GB + "public"."release_ident" | 4892 MB | 5875 MB | 11 GB + "public"."work_ident" | 4892 MB | 5874 MB | 11 GB + "public"."work_rev" | 3174 MB | 2936 MB | 6109 MB + "public"."file_rev_url" | 3634 MB | 1456 MB | 5090 MB + "public"."file_rev" | 792 MB | 1281 MB | 2073 MB + "public"."abstracts" | 1665 MB | 135 MB | 1800 MB + "public"."file_edit" | 565 MB | 561 MB | 1126 MB + "public"."file_release" | 380 MB | 666 MB | 1045 MB + "public"."file_ident" | 415 MB | 496 MB | 911 MB + "public"."creator_rev" | 371 MB | 457 MB | 828 MB + "public"."creator_edit" | 347 MB | 353 MB | 700 MB + "public"."creator_ident" | 255 MB | 305 MB | 559 MB + "public"."release_rev_abstract" | 183 MB | 237 MB | 421 MB + "public"."changelog" | 122 MB | 126 MB | 247 MB + "public"."editgroup" | 138 MB | 81 MB | 219 MB + "public"."container_rev" | 52 MB | 38 MB | 89 MB + "public"."container_edit" | 32 MB | 30 MB | 62 MB + "public"."container_ident" | 24 MB | 28 MB | 52 MB + + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan +----------------------+--------------+------+--------------+----------+---------- + release_edit | 0 | OK | 6993084416 | 0 | 0 + container_rev | 0 | OK | 54124544 | 0 | 0 + creator_ident | 0 | OK | 266919936 | 0 | 0 + creator_edit | 0 | OK | 363921408 | 0 | 0 + work_rev | 0 | OK | 3327262720 | 0 | 0 + creator_rev | 0 | OK | 388726784 | 0 | 0 + work_ident | 0 | OK | 5128560640 | 0 | 0 + work_edit | 0 | OK | 6993108992 | 0 | 0 + container_ident | 0 | OK | 25092096 | 0 | 0 + file_edit | 0 | OK | 598278144 | 0 | 0 + container_edit | 0 | OK | 33857536 | 0 | 0 + changelog | 0 | OK | 127549440 | 0 | 0 + abstracts | -4714 | OK | 1706713088 | 0 | 4714 + file_release | -13583 | OK | 401752064 | 0 | 13583 + file_rev_url | -13583 | OK | 3832389632 | 0 | 13583 + editgroup | -74109 | OK | 144277504 | 0 | 74109 + release_contrib | -76699 | OK | 20357849088 | 0 | 76699 + release_ref | -76700 | OK | 183009157120 | 3 | 76703 + release_rev_abstract | -76939 | OK | 192102400 | 0 | 76939 + release_rev | -77965 | OK | 47602647040 | 0 | 77965 + file_ident | -100089 | OK | 438255616 | 3 | 100092 + release_ident | -152809 | OK | 5128617984 | 0 | 152809 + file_rev | -440780 | OK | 837705728 | 0 | 440780 +(23 rows) + +Continuing imports: + + zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - + => HTTP response body: {"message":"duplicate key value violates unique constraint \"file_edit_editgroup_id_ident_id_key\""} + diff --git a/notes/bootstrap/import_timing_20180923.txt b/notes/bootstrap/import_timing_20180923.txt new file mode 100644 index 00000000..c7161842 --- /dev/null +++ b/notes/bootstrap/import_timing_20180923.txt @@ -0,0 +1,91 @@ + + 105595.18user 3903.65system 15:59:39elapsed 190%CPU (0avgtext+0avgdata 458836maxresident)k + 71022792inputs+327828472outputs (176major+31149593minor)pagefaults 0swaps + + real 959m39.521s + user 1845m10.392s + sys 70m33.780s + +Did I get the same error again? 
I'm confused: + + HTTP response body: {"message":"number of parameters must be between 0 and 65535\n"} + (but not in all threads) + +Yes, ugh, because 50*2500 can be over (it's not just individual large releases, +they come in big batches). + +But: + + select count(id) from release_ident; => 70006121 + +A lot, though not 72 million like last time, hrm. I'm... going to move ahead I +guess. + +"Processed 4440850 lines, inserted 3509600, updated 0." + => implies 79029915 records + + time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched --no-file-update - + Processed 530750 lines, inserted 435239, updated 0. (etc) + Command exited with non-zero status 1 + 15121.47user 676.49system 2:23:52elapsed 183%CPU (0avgtext+0avgdata 70076maxresident)k + 127760inputs+3477184outputs (116major+475489minor)pagefaults 0swaps + + real 143m52.681s + user 252m31.620s + sys 11m21.608s + + zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - + + Processed 485200 lines, inserted 244101, updated 168344. (etc) + 22671.44user 1069.84system 3:27:47elapsed 190%CPU (0avgtext+0avgdata 39348maxresident)k + 99672inputs+2497848outputs (109major+422150minor)pagefaults 0swaps + +fatcat-export dump: + + INFO 2018-09-25T10:01:06Z: fatcat_export: Done reading (70006121 lines), waiting for workers to exit... + 197GiB 4:56:17 [11.4MiB/s] [ <=> ] + +How big is everything? + + select count(*) from file_release; => 10,485,964 + select count (distinct target_release_ident_id) from file_release; => 6,486,934 + select count(id) from release_ident; => 70,006,121 + select count(*) from container_ident; => 354,793 + select count(*) from creator_ident; => 3,906,990 + Size: 324.24G + /dev/vda1 1.8T 511G 1.2T 31% / + + table_name | table_size | indexes_size | total_size +--------------------------------------------------------------+------------+--------------+------------ + "public"."release_ref" | 121 GB | 42 GB | 163 GB + "public"."release_rev" | 33 GB | 19 GB | 52 GB + "public"."release_contrib" | 21 GB | 18 GB | 39 GB + "public"."release_edit" | 6218 MB | 6084 MB | 12 GB + "public"."work_edit" | 6218 MB | 6084 MB | 12 GB + "public"."release_ident" | 4560 MB | 5470 MB | 10030 MB + "public"."work_ident" | 4560 MB | 5466 MB | 10027 MB + "public"."file_rev_url" | 5543 MB | 2112 MB | 7655 MB + "public"."work_rev" | 2958 MB | 2733 MB | 5691 MB + "public"."file_rev" | 1201 MB | 1811 MB | 3012 MB + "public"."abstracts" | 2294 MB | 184 MB | 2478 MB + "public"."file_edit" | 931 MB | 864 MB | 1795 MB + "public"."file_release" | 605 MB | 1058 MB | 1663 MB + "public"."file_ident" | 529 MB | 633 MB | 1162 MB + "public"."creator_rev" | 371 MB | 456 MB | 826 MB + "public"."creator_edit" | 347 MB | 352 MB | 699 MB + "public"."release_rev_abstract" | 250 MB | 325 MB | 575 MB + "public"."creator_ident" | 255 MB | 304 MB | 559 MB + "public"."changelog" | 122 MB | 127 MB | 250 MB + "public"."editgroup" | 138 MB | 82 MB | 220 MB + "public"."container_rev" | 52 MB | 38 MB | 89 MB + "public"."container_edit" | 32 MB | 30 MB | 62 MB + "public"."container_ident" | 24 MB | 28 MB | 52 MB + +Hrm, bunch of not-accepted containers: + + select count(*) from container_ident where is_live='f'; => 301507 + select count(*) from release_ident where is_live='f'; => 0 + select count(*) from work_ident where is_live='f'; => 0 + select count(*) from creator_ident 
where is_live='f'; => 1 (there was a hang earlier) + select count(*) from file_ident where is_live='f'; => 0 + diff --git a/notes/bootstrap/initial_sources.txt b/notes/bootstrap/initial_sources.txt new file mode 100644 index 00000000..cc22019d --- /dev/null +++ b/notes/bootstrap/initial_sources.txt @@ -0,0 +1,26 @@ + +Probably start with: + + crossref (including citations) + arxiv + medline + +then merge in: + + dblp + CORE + MSAG dump + VIAF + archive.org paper/url manifest + semantic scholar + oaDOI + +and later: + + wikidata + opencitations + openlibrary + +national libraries: + + http://www.dnb.de/EN/Service/DigitaleDienste/LinkedData/linkeddata_node.html diff --git a/notes/bootstrap/old_imports.txt b/notes/bootstrap/old_imports.txt new file mode 100644 index 00000000..1233d4a8 --- /dev/null +++ b/notes/bootstrap/old_imports.txt @@ -0,0 +1,20 @@ + +## ORCID + +Directly from compressed tarball; takes about 2 hours in production: + + tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | grep '"person":' | time parallel -j12 --pipe --round-robin ./fatcat_import.py import-orcid - + +After tuning database, `jq` CPU seems to be bottleneck, so, from pre-extracted +tarball: + + tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | rg '"person":' > /srv/datasets/public_profiles_1_2_json.all.json + time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - + +Does not work: + + ./fatcat_import.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json + +Instead: + + cat /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json | jq -c . | ./fatcat_import.py import-orcid - diff --git a/notes/contrib_types.txt b/notes/contrib_types.txt deleted file mode 100644 index 01024b40..00000000 --- a/notes/contrib_types.txt +++ /dev/null @@ -1,14 +0,0 @@ - -See also: - -author -editor -translator -illustrator -interviewer -reviewer - - -container-author -container-editor -reviewed-author diff --git a/notes/import_timing_20180815.txt b/notes/import_timing_20180815.txt deleted file mode 100644 index 1206cc41..00000000 --- a/notes/import_timing_20180815.txt +++ /dev/null @@ -1,292 +0,0 @@ - -Schema changes since previous imports: -- more fields (identifiers+indexes) -- timestamps -- UUIDs more places -- fixed some crossref import bugs? -- abstracts -- file_urls as table (not single value) -- timestamps -- TEXT -> CHAR in a few places -- removed many work fields - -## Containers - -(python)webcrawl@wbgrp-svc500:/srv/fatcat/src/python$ time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv - -real 1m25.292s -user 0m12.640s -sys 0m0.412s - -## Creators - -time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - - -(times very invalid due to hangs; got 3537837 creators, which is most of the way, so *shrug*) -real 22m2.465s -user 26m41.924s -sys 1m33.844s - -## Releases - -xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt - - 128516.30 user - 3905.14 system - 44:17:05 elapsed - 83% CPU - -Almost 44 hours... I think I remember more like 36 hours last time? Things -slowed down a lot towards the end, many more ORCID cross-references? - -looking in htop, postgres seems to be primary bottleneck. 
At something like 12 -hours in, had 44 million release_ident rows, which is 1000/second. - -Note: seems like the more frequently `count(*)` is run, the more performant. -Because in-memory? - - 2018-08-16 16:54:16.977 UTC [17996] postgres@fatcat_prod LOG: duration: 42949.549 ms statement: select count(id) from release_ident; - - fatcat_prod=# select count(*) from release_ident; - count - ---------- - 44185608 - (1 row) - - Time: 2753.916 ms (00:02.754) - fatcat_prod=# select count(*) from release_ident; - count - ---------- - 44187937 - (1 row) - - Time: 2711.670 ms (00:02.712) - -As expected, autovacuum very busy. Only ~150 TPS; but that includes batch -writes? 75061172 rows. - -## Files - - time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite - - Done! Inserted 6607075 - - real 2152m28.822s => 36 hours (!) - user 401m46.464s - sys 21m45.724s - - -Going pretty slow, < 100 transactions/sec. Lots of SELECTs, which seem slow, on the abstract table? - - SELECT "release_rev_abstract"."id", "release_rev_abstract"."release_rev", "release_rev_abstract"."abstract_sha1", "release_rev_abstract"."mimetype", "release_rev_abstract"."lang", "abstracts"."sha1", "abstracts"."content" FROM ("release_rev_abstract" INNER JOIN "abstracts" ON "release_rev_abstract"."abstract_sha1" = "abstracts"."sha1") WHERE "release_rev_abstract"."release_rev" = 'ffffffc0-4dd2-47ce-a51d-44051f3699ce'; - -Created index: - - CREATE INDEX release_rev_abstract_rev_idx ON release_rev_abstract(release_rev); - -... and things sped way up. Re-ran some crossref imports to EXPLAIN and didn't -see non-indexed queries. Maybe an ANALYZE does need to happen? - -This being single-threaded is going to be a problem in the future. ~50 million -files would be ~2 weeks. - -## Post-Import Status - - Size: 358.89G (postgres self-reported) - Mem.: 57.10% - 16.85G/49.14G - -Was 184G last time in late June; doubled in size (!). 
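
One way to get a single "self-reported" figure like the one above (a minimal
sketch, assuming the fatcat_prod database name used elsewhere in these notes):

    SELECT pg_size_pretty(pg_database_size('fatcat_prod'));

The filesystem-level numbers below also count WAL and anything else in the
cluster, so they read a bit higher.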
- - bnewbold@wbgrp-svc500$ df -h / - Filesystem Size Used Avail Use% Mounted on - /dev/vda1 858G 529G 286G 65% / - - bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/ /srv/datasets/ /srv/elastic-blah/ - 361G /var/lib/postgresql/ - 83G /srv/datasets/ - 77G /srv/elastic-blah/ - - fatcat_prod=# select count(*) from changelog; => 2,085,067 - - SELECT - table_name, - pg_size_pretty(table_size) AS table_size, - pg_size_pretty(indexes_size) AS indexes_size, - pg_size_pretty(total_size) AS total_size - FROM ( - SELECT - table_name, - pg_table_size(table_name) AS table_size, - pg_indexes_size(table_name) AS indexes_size, - pg_total_relation_size(table_name) AS total_size - FROM ( - SELECT ('"' || table_schema || '"."' || table_name || '"') AS table_name - FROM information_schema.tables - ) AS all_tables - ORDER BY total_size DESC - ) AS pretty_sizes; - - table_name | table_size | indexes_size | total_size ---------------------------------------------------------------+------------+--------------+------------ - "public"."release_ref" | 159 GB | 47 GB | 206 GB - "public"."release_rev" | 40 GB | 10 GB | 51 GB - "public"."release_contrib" | 19 GB | 20 GB | 39 GB - "public"."release_ident" | 5797 MB | 6597 MB | 12 GB - "public"."work_ident" | 5787 MB | 6394 MB | 12 GB - "public"."release_edit" | 6674 MB | 4646 MB | 11 GB - "public"."work_edit" | 6674 MB | 4646 MB | 11 GB - "public"."work_rev" | 3175 MB | 2939 MB | 6114 MB - "public"."file_rev_url" | 1995 MB | 275 MB | 2270 MB - "public"."abstracts" | 1665 MB | 135 MB | 1800 MB - "public"."file_rev" | 829 MB | 954 MB | 1783 MB - "public"."file_ident" | 498 MB | 532 MB | 1030 MB - "public"."file_release" | 369 MB | 642 MB | 1011 MB - "public"."file_edit" | 591 MB | 410 MB | 1002 MB - "public"."creator_rev" | 337 MB | 318 MB | 655 MB - "public"."creator_ident" | 280 MB | 297 MB | 577 MB - "public"."creator_edit" | 316 MB | 220 MB | 536 MB - "public"."release_rev_abstract" | 183 MB | 84 MB | 267 MB - "public"."changelog" | 123 MB | 125 MB | 249 MB - "public"."editgroup" | 139 MB | 81 MB | 220 MB - "public"."container_rev" | 19 MB | 6912 kB | 26 MB - "public"."container_ident" | 6896 kB | 7016 kB | 14 MB - "public"."container_edit" | 8056 kB | 5240 kB | 13 MB - -In context, the full uncompressed crossref 2018-01-21 dump is about 285 GB. - -For many of these indexes, and the _ident tables, switching from UUID to -BIGSERIAL would half the size. - -## Exports - - time ./fatcat_export.py changelog - | pv -l | wc - - As of: - - 159k 1:17:35 [34.3 /s] - 159,740 lines - 2,427,277,881 chars (bytes; 2.4GB) - - real 77m35.183s - user 15m36.208s - sys 0m31.484s - -Running at about 100/sec; estimate 6 hours for completion. Could shard using -start/end flags, but am not here. - -Running `quick_dump.sql` (identifier tables, in a transaction): - - 251M Aug 19 23:08 fatcat_ident_creators.tsv - 5.9M Aug 19 23:08 fatcat_ident_containers.tsv - 467M Aug 19 23:08 fatcat_ident_files.tsv - 5.2G Aug 19 23:10 fatcat_ident_releases.tsv - 5.2G Aug 19 23:11 fatcat_ident_works.tsv - 12K Aug 19 23:11 . - 1.8G Aug 19 23:12 fatcat_abstracts.json - -Work and Release tables in under 2 minutes each; say 5 minutes total. - - time ./fatcat_export.py releases /tmp/fatcat_ident_releases.tsv - | pv -l | wc - - 172k 1:07:08 [42.7 /s] - 172181 lines - 1,118,166,293 chars (bytes; 1.1 GB) - - real 67m8.340s - user 10m21.988s - sys 0m34.612s - -Running at only 10/sec or so, this would take forever even if sharded. :( - -Both exports/dumps are running in parallel. 
"Expand" queries might help with speed? - -## Postgres Analysis - -SELECT * -FROM - pg_stat_statements -ORDER BY - total_time DESC LIMIT 5; - -Summary: - - SELECT "creator_ident" by ORCID - 1,295,864 calls - 930,305,208 total time - 717.9 mean time <= this should be less than a ms! - - INSERT INTO release_rev - 75144055 calls - 111470961 total time - 1.483 mean time - - INSERT INTO work_rev - 75,144,055 calls - 82693994 total time - 1.1 mean time - - INSERT INTO release_contrib (creator_ident_id = DEFAULT) RETURNING * - 26,008,280 calls <= why so few? different query depending on number - of rows inserted - 18955782 total time - 0.728 mean time - - SELECT container_ident - 78,4143 calls - 17683156 total time - 22.55 mean time <= why so slow? - - INSERT INTO release_contrib - 15,072,820 calls - - INSERT INTO "release_contrib - - - relname | too_much_seq | case | rel_size | seq_scan | idx_scan -----------------------+--------------+----------------+--------------+----------+----------- - file_rev_url | 2391 | Missing Index? | 2091147264 | 2391 | 0 - file_release | -30670 | OK | 386899968 | 2 | 30672 - container_rev | -979948 | OK | 20242432 | 784146 | 1764094 - file_edit | -2206807 | OK | 619896832 | 6 | 2206813 - creator_edit | -2206810 | OK | 331079680 | 11 | 2206821 - work_edit | -2206811 | OK | 6996566016 | 14 | 2206825 - release_edit | -2206811 | OK | 6996582400 | 14 | 2206825 - container_edit | -2206816 | OK | 8216576 | 5 | 2206821 - changelog | -2209659 | OK | 129286144 | 10 | 2209669 - abstracts | -3486466 | OK | 1706237952 | 8 | 3486474 - release_rev_abstract | -4975493 | OK | 191602688 | 42919 | 5018412 - release_ref | -5032717 | OK | 170494861312 | 3 | 5032720 - release_contrib | -5032744 | OK | 20370251776 | 3 | 5032747 - creator_rev | -8400410 | OK | 353583104 | 1296507 | 9696917 - file_ident | -13483224 | OK | 522190848 | 7 | 13483231 - creator_ident | -16686744 | OK | 293625856 | 3 | 16686747 - file_rev | -32405557 | OK | 868515840 | 4 | 32405561 - container_ident | -69162337 | OK | 7028736 | 3 | 69162340 - work_rev | -150288161 | OK | 3328589824 | 1 | 150288162 - editgroup | -162783807 | OK | 146112512 | 9 | 162783816 - release_ident | -165676917 | OK | 6076841984 | 52 | 165676969 - work_ident | -229439828 | OK | 6066814976 | 3 | 229439831 - release_rev | -930140217 | OK | 43360542720 | 9 | 930140226 - -TODO changes: -- don't return all as often; in particular, inserting release_contrib, release_ref -x missing an index somewhere on file_rev_url, release_rev_abstract -x why so many seq_scan on container_rev, creator_rev - => running/EXPLAIN same query on psql hits index, not seq_scan - => seemed to be an issue with VALUE params getting sent separately; query - planner only looked at query and wasn't using index on ORCID/ISSN-L because - it didn't know those values were not-NULL? - => adding NOT NULL to query seems to have sped up case of there being a - "hit", but no hit still slow. 
might need to change indices or something for - the (perhaps common in future) case of DOI lookups with invalid DOIs (eg, - CORE import) - -random DEBUG queries: - - EXPLAIN ANALYSE SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = true AND "creator_ident"."redirect_id" IS NULL LIMIT 1; - - EXPLAIN VERBOSE SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_ident"."is_live" = true AND "creator_ident"."redirect_id" IS NULL VALUES ('0000-0002-8867-1669') LIMIT 1; - - EXPLAIN SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = "container_rev"."id") WHERE "container_rev"."issnl" = '0001-0782' AND "container_ident"."is_live" = true AND "container_ident"."redirect_id" IS NULL LIMIT 1; - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; diff --git a/notes/import_timing_20180908.txt b/notes/import_timing_20180908.txt deleted file mode 100644 index 3091e4fa..00000000 --- a/notes/import_timing_20180908.txt +++ /dev/null @@ -1,249 +0,0 @@ - -Changes since last time: -- auto-accept flag (potentially no need to UPDATE/vacuum database) -- large CRUD code refactor (but not complete removal of api_server CRUD yet) -- removed "single query insert per entity", but added row-based inserts - -## Laptop Rough Timing - -HTTP-VERB branch 7354899493f6448bed5698ad6ade1dbebcf39379: -=> note: in this branch, importer is sitll creating editgroups in separate request - - time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv - real 0m28.779s - user 0m5.716s - sys 0m0.208s - - cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt - - 89.83user 4.58system 1:46.67elapsed 88%CPU (0avgtext+0avgdata 386184maxresident)k - 78632inputs+385408outputs (1major+579693minor)pagefaults 0swaps - # 1:46.67 elapsed 
- - # running again - time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv - real 0m29.714s - user 0m6.048s - sys 0m0.212s - - cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt - 89.07user 4.52system 1:48.11elapsed 86%CPU (0avgtext+0avgdata 385940maxresident)k - 0inputs+377456outputs (0major+583532minor)pagefaults 0swaps - # 1:48.11 elapsed - -MASTER branch 0053d133f8ff96aa4dedc1ff7e2754812ddfc79a - - time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv - real 1m8.061s - user 0m7.384s - sys 0m0.280s - - cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt - 96.68user 5.25system 3:23.05elapsed 50%CPU (0avgtext+0avgdata 385516maxresident)k - 0inputs+389168outputs (0major+586810minor)pagefaults 0swaps - # 3:23.05 elapsed - -AUTO-ACCEPT branch 8cccbcdef11e7ddc761ec494cb894a8d49a0d510 - - time ./fatcat_import.py import-issn /home/bnewbold/code/oa-journals-analysis/upload-2018-04-05/journal_extra_metadata.csv - real 1m7.757s - user 0m6.240s - sys 0m0.228s - - cat /data/crossref/crossref-works.2018-01-21.badsample_100k.json | time parallel -j4 --round-robin --pipe ./fatcat_import.py import-crossref - /data/issn/20180216.ISSN-to-ISSN-L.txt - 91.73user 4.65system 3:26.61elapsed 46%CPU (0avgtext+0avgdata 385512maxresident)k - 0inputs+384976outputs (0major+580544minor)pagefaults 0swaps - -So, annecdotally, seems like autoaccept didn't do much here (not -vacuum-limited?), but the row inserts were more than a factor of 2x performance -improvement. Great! Could try experimenting with even larger batch sizes... - - -## Production Import - -http-verb branch 7354899493f6448bed5698ad6ade1dbebcf39379 - - time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv - - real 0m30.845s - user 0m8.884s - sys 0m0.312s - - - time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - - # TPS: 1181 - real 6m54.019s => down from 22m! - user 25m33.980s - sys 1m30.480s - - xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt - # fatcatd at 200% CPU (2 full cores); many PIDs/workers, but only one so busy (must be diesel/db driver?) - # parallel at ~80% - # postgres pretty busy; looks like doing PARSE on ever request? some idle in transaction - # postgres still does a vacuum to analyze; good! - # ~600-1000 TPS near start (large variance) - # left to run overnight... 
- # slowed down to ~50-80 TPS about 10 hours later - # lots of IOWAIT - # only at 40309165 rows in the morning; this could take a long time - - # PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgres - Ref.: 2s - # Size: 153.71G - 1.90M/s | TPS: 77 - # Mem.: 68.70% - 22.60G/49.14G | IO Max: 20448/s - # Swap: 3.80% - 1.88G/50.00G | Read : 10.17M/s - 2603/s - # Load: 11.71 11.80 12.01 | Write: 4.00M/s - 1024/s - - 92530.17user 2891.76system 35:45:15elapsed 74%CPU (0avgtext+0avgdata 463520maxresident)k - 1093736inputs+302588448outputs (52568major+36405225minor)pagefaults 0swaps - # 35:45:15 elapsed - - time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite - -## Perf Tweaks - - - - SELECT - relname, - seq_scan - idx_scan AS too_much_seq, - CASE - WHEN - seq_scan - coalesce(idx_scan, 0) > 0 - THEN - 'Missing Index?' - ELSE - 'OK' - END, - pg_relation_size(relname::regclass) AS rel_size, seq_scan, idx_scan - FROM - pg_stat_all_tables - WHERE - schemaname = 'public' - AND pg_relation_size(relname::regclass) > 80000 - ORDER BY - too_much_seq DESC; - - relname | too_much_seq | case | rel_size | seq_scan | idx_scan -----------------------+--------------+------+--------------+----------+----------- - file_rev_url | -1 | OK | 163323904 | 2 | 3 - file_release | -3 | OK | 24461312 | 2 | 5 - release_edit | -10265 | OK | 6080495616 | 2 | 10267 - container_edit | -10265 | OK | 31170560 | 2 | 10267 - work_edit | -10265 | OK | 6080364544 | 2 | 10267 - file_edit | -10265 | OK | 49111040 | 2 | 10267 - creator_edit | -10265 | OK | 330924032 | 2 | 10267 - changelog | -11692 | OK | 106668032 | 2 | 11694 - release_ref | -374197 | OK | 125437673472 | 3 | 374200 - release_contrib | -374202 | OK | 16835354624 | 3 | 374205 - release_rev_abstract | -374919 | OK | 162250752 | 3 | 374922 - file_ident | -1047172 | OK | 41926656 | 2 | 1047174 - container_rev | -1378943 | OK | 50356224 | 724612 | 2103555 <= - file_rev | -2407647 | OK | 68493312 | 4 | 2407651 - abstracts | -2765610 | OK | 1450901504 | 1 | 2765611 - creator_ident | -7127467 | OK | 242688000 | 2 | 7127469 - creator_rev | -7943007 | OK | 353370112 | 839155 | 8782162 <= - container_ident | -57488245 | OK | 23060480 | 2 | 57488247 - release_ident | -66085389 | OK | 4459159552 | 14 | 66085403 - work_rev | -130613391 | OK | 2892775424 | 1 | 130613392 - work_ident | -130633923 | OK | 4459192320 | 2 | 130633925 - editgroup | -136775970 | OK | 120561664 | 1 | 136775971 - release_rev | -718337215 | OK | 36850507776 | 8 | 718337223 - - -Slowest queries: - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 - - SELECT "container_ident"."id", "container_ident"."is_live", "container_ident"."rev_id", "container_ident"."redirect_id", "container_rev"."id", "container_rev"."extra_json", "container_rev"."name", "container_rev"."publisher", "container_rev"."issnl", "container_rev"."wikidata_qid", "container_rev"."abbrev", "container_rev"."coden" FROM ("container_ident" INNER JOIN "container_rev" ON "container_ident"."rev_id" = 
"container_rev"."id") WHERE "container_rev"."issnl" = $1 AND "container_rev"."issnl" IS NOT NULL AND "container_ident"."is_live" = $2 AND "container_ident"."redirect_id" IS NULL LIMIT $3 - - SELECT $2 FROM ONLY "public"."release_rev" x WHERE "id" OPERATOR(pg_catalog.=) $1 FOR KEY SHARE OF x:...skipping... - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 - - -DEBUG: (while a file import is running) - -Creator lookup where row exists: - - time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-1663 - real 0m0.020s - user 0m0.004s - sys 0m0.004s - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; - => (1 row) - Time: 0.988 ms - -Creator lookup where row doesn't exist: - - bnewbold@wbgrp-svc500$ time curl localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X - {"message":"Not found: 0000-0002-8867-166X"} - real 0m1.282s - user 0m0.008s - sys 0m0.000s - Sep 10 21:50:49 wbgrp-svc500.us.archive.org fatcat-api[25327]: Sep 10 21:50:49.231 INFO GET http://localhost:9411/v0/creator/lookup?orcid=0000-0002-8867-166X 404 Not Found (1282 ms) - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; - => (0 rows) - Time: 0.810 ms - -------- - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM 
("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-1663' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT 1; - -from logs: - - execute __diesel_stmt_5: SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 - -NOTE: have been doing *full* postgres logs this whole time! probably a ton of disk churn. :( - - - exact logs: - SET TIME ZONE 'UTC' - SET CLIENT_ENCODING TO 'UTF8' - SELECT 1 - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = $1 AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = $2 AND "creator_ident"."redirect_id" IS NULL LIMIT $3 - parameters: $1 = '0000-0002-8867-166X', $2 = 't', $3 = '1' - - SELECT "creator_ident"."id", "creator_ident"."is_live", "creator_ident"."rev_id", "creator_ident"."redirect_id", "creator_rev"."id", "creator_rev"."extra_json", "creator_rev"."display_name", "creator_rev"."given_name", "creator_rev"."surname", "creator_rev"."orcid", "creator_rev"."wikidata_qid" FROM ("creator_ident" INNER JOIN "creator_rev" ON "creator_ident"."rev_id" = "creator_rev"."id") WHERE "creator_rev"."orcid" = '0000-0002-8867-166X' AND "creator_rev"."orcid" IS NOT NULL AND "creator_ident"."is_live" = 't' AND "creator_ident"."redirect_id" IS NULL LIMIT '1'; - -CHANGES: -- restart fatcat-api -- restart postgresql -- select pg_stat_reset(); - -seems like the fetch involves a index scans on creator_rev and creator_ident, *and* a scan on creator_rev. 
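
After the pg_stat_reset(), per-index scan counts should make this concrete; a
sketch against the standard statistics view:

    SELECT relname, indexrelname, idx_scan
    FROM pg_stat_user_indexes
    WHERE relname IN ('creator_rev', 'creator_ident')
    ORDER BY idx_scan DESC;

If the orcid index shows essentially zero idx_scan while the monitoring query
keeps reporting seq_scan on creator_rev, that points at the planner rather
than a missing index.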
- - - table_name | table_size | indexes_size | total_size ---------------------------------------------------------------+------------+--------------+------------ - "public"."release_ref" | 117 GB | 34 GB | 151 GB - "public"."release_rev" | 34 GB | 9202 MB | 43 GB - "public"."release_contrib" | 16 GB | 17 GB | 33 GB - "public"."work_edit" | 5800 MB | 4033 MB | 9833 MB - "public"."release_edit" | 5800 MB | 4032 MB | 9833 MB - "public"."work_ident" | 4254 MB | 5102 MB | 9356 MB - "public"."release_ident" | 4254 MB | 5095 MB | 9349 MB - "public"."work_rev" | 2759 MB | 2545 MB | 5304 MB - "public"."abstracts" | 1417 MB | 114 MB | 1530 MB - "public"."creator_rev" | 337 MB | 277 MB | 614 MB - "public"."creator_edit" | 316 MB | 221 MB | 536 MB - "public"."creator_ident" | 232 MB | 279 MB | 511 MB - "public"."file_rev_url" | 260 MB | 101 MB | 361 MB - "public"."release_rev_abstract" | 155 MB | 200 MB | 355 MB - "public"."file_rev" | 109 MB | 124 MB | 233 MB - "public"."changelog" | 102 MB | 106 MB | 208 MB - "public"."editgroup" | 116 MB | 69 MB | 184 MB - "public"."file_ident" | 66 MB | 70 MB | 136 MB - "public"."file_edit" | 78 MB | 53 MB | 131 MB - "public"."file_release" | 38 MB | 64 MB | 103 MB - "public"."container_rev" | 48 MB | 26 MB | 74 MB - "public"."container_edit" | 30 MB | 20 MB | 50 MB - "public"."container_ident" | 22 MB | 26 MB | 48 MB - -CHANGES: -- remove "IS NOT NULL" from creator_rev, that seemed to be a significant speedup for the "row not found" case. - diff --git a/notes/import_timing_20180910.txt b/notes/import_timing_20180910.txt deleted file mode 100644 index 43c76e43..00000000 --- a/notes/import_timing_20180910.txt +++ /dev/null @@ -1,319 +0,0 @@ - -Changes since last time: -- removed "WHERE IS NOT NULL" from lookup indexes -- some cleanups -- JSONB - -## QA Timing - -master branch b15fa552e288bec5bbc2b07a3e11bab9235a1e7c - - time ./fatcat_import.py import-issn /srv/datasets/journal_extra_metadata.csv # --batch-size 100 - real 0m30.249s - user 0m9.364s - sys 0m0.268s - - time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - # --batch-size 100 - => something went sort of wrong? seemed to hang, with no activity. - => may have sent a signal or something accidentally? - - xzcat /srv/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/datasets/20180216.ISSN-to-ISSN-L.txt --batch-size 100 - => ran for a few seconds before restarting with increased batch size - => increased batch size seems to be too much for releases? stick with 50 - => still lots of long autovacuum (ANALYZE) - - 67017.73user 2054.96system 25:59:38elapsed 73%CPU (0avgtext+0avgdata 424692maxresident)k - 2085872inputs+219678920outputs (100457major+30837042minor)pagefaults 0swaps - # 25:59:38 elapsed - - time ./fatcat_import.py import-manifest /srv/datasets/idents_files_urls.sqlite # --batch-size 100 - => taking longer to start than I remembered... - - Done! 
Inserted 6607075 - real 1521m37.856s - user 305m38.444s - sys 18m13.916s - => 25.5 hours - -Disk/table usage: - - Size: 233.95G - - table_name | table_size | indexes_size | total_size - --------------------------------------------------------------+------------+--------------+------------ - "public"."release_ref" | 92 GB | 26 GB | 118 GB - "public"."release_rev" | 30 GB | 14 GB | 45 GB - "public"."release_contrib" | 12 GB | 13 GB | 26 GB - "public"."release_edit" | 4710 MB | 3353 MB | 8062 MB - "public"."work_edit" | 4710 MB | 3352 MB | 8062 MB - "public"."work_ident" | 3454 MB | 4153 MB | 7608 MB - "public"."release_ident" | 3454 MB | 4152 MB | 7606 MB - "public"."work_rev" | 2241 MB | 2076 MB | 4317 MB - "public"."file_rev_url" | 1994 MB | 778 MB | 2772 MB - "public"."file_rev" | 827 MB | 1215 MB | 2042 MB - "public"."file_ident" | 501 MB | 533 MB | 1034 MB - "public"."file_edit" | 590 MB | 411 MB | 1001 MB - "public"."creator_rev" | 371 MB | 455 MB | 826 MB - "public"."abstracts" | 649 MB | 52 MB | 701 MB - "public"."file_release" | 251 MB | 435 MB | 686 MB - "public"."creator_edit" | 347 MB | 242 MB | 590 MB - "public"."creator_ident" | 255 MB | 305 MB | 560 MB - "public"."release_rev_abstract" | 70 MB | 90 MB | 160 MB - "public"."changelog" | 52 MB | 53 MB | 106 MB - "public"."editgroup" | 60 MB | 35 MB | 94 MB - "public"."container_rev" | 40 MB | 27 MB | 67 MB - "public"."container_edit" | 24 MB | 16 MB | 39 MB - "public"."container_ident" | 17 MB | 20 MB | 37 MB - -I think this is before the JSONB schema change, so won't reflect actual/final. - -Perf: - - relname | too_much_seq | case | rel_size | seq_scan | idx_scan - ----------------------+--------------+------+-------------+----------+----------- - file_rev_url | -18 | OK | 2089836544 | 2 | 20 - file_release | -4001 | OK | 263249920 | 2 | 4003 - container_edit | -132140 | OK | 24608768 | 2 | 132142 - release_edit | -132140 | OK | 4937408512 | 2 | 132142 - work_edit | -132140 | OK | 4937351168 | 2 | 132142 - file_edit | -132140 | OK | 618545152 | 2 | 132142 - creator_edit | -132140 | OK | 363954176 | 2 | 132142 - changelog | -134456 | OK | 54919168 | 5 | 134461 - container_rev | -1276670 | OK | 41680896 | 592993 | 1869663 - abstracts | -1353093 | OK | 669425664 | 3 | 1353096 - release_ref | -3429019 | OK | 99223470080 | 3 | 3429022 - release_contrib | -3429026 | OK | 13406437376 | 3 | 3429029 - release_rev_abstract | -3435251 | OK | 73359360 | 3 | 3435254 - creator_ident | -5360858 | OK | 266977280 | 11 | 5360869 - creator_rev | -8071846 | OK | 388734976 | 257781 | 8329627 - file_ident | -13478348 | OK | 524746752 | 7 | 13478355 - file_rev | -30368193 | OK | 867401728 | 4 | 30368197 - container_ident | -47140767 | OK | 18219008 | 9 | 47140776 - release_ident | -61021070 | OK | 3620864000 | 24 | 61021094 - work_rev | -106056502 | OK | 2349146112 | 1 | 106056503 - work_ident | -106320780 | OK | 3620954112 | 6 | 106320786 - editgroup | -117742455 | OK | 62398464 | 2 | 117742457 - release_rev | -563415274 | OK | 32455860224 | 8 | 563415282 - -Thought I had resoled those scans... hrm. Pretty sure (but not 100%) that I -cleared the counters. - -Dumps! 
- - time psql fatcat_prod < ../extra/quick_dump.sql - real 4m5.729s - user 0m0.056s - sys 0m0.016s - => while other stuff running - - time ./ident_table_snapshot.sh - => seems very dominated by tar/xz compression time; should switch back to gzip - - DATABASE_URL=$DATABASE_URL time ./ident_table_snapshot.sh /tmp - 614.80user 31.74system 13:23.51elapsed 80%CPU (0avgtext+0avgdata 10124maxresident)k - 8311456inputs+9761200outputs (2major+3792minor)pagefaults 0swaps - => 13.5 minutes - => fatcat_idents.2018-09-13.215147.r874951.tar.gz: 4.7GB - => uncompressed, 8.3GB total - - cat ../extra/sql_dumps/fatcat_ident_releases.tsv | time ./target/release/fatcat-export release -j8 -q | pv -l | gzip > fatcat_release_dump.json.gz - => can not run with more than -j8, it seems (postgres client limit) - => TPS: 12522 (!) - => 101k 0:00:34 [2.99k/s] - => estimate 6.5 hours - - # NOTE AFTER THE FACT: had "contaner" typo in the below, so wasn't expanding containers - cat ../extra/sql_dumps/fatcat_ident_releases.tsv | time ./target/release/fatcat-export release --expand files,contaner -j8 -q | pv -l | gzip > fatcat_release_dump_expanded.json.gz - => TPS: 29605 (!) - => 183k 0:00:35 [ 5.8k/s] (how is this faster than the above? other disk stuff finished?) - => 341k 0:01:07 [3.48k/s] (ah, must have been in disk cache) - => 4.12M 0:16:11 (averaging 4.2k/s; estimate 4 hours total) - => fatcat_release_dump_expanded.json.gz: 3.4G with 6.76M lines, so expecting 35GB or so - - ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- - usr sys idl wai hiq siq| read writ| recv send| in out | int csw - 14 3 73 8 0 2| 43M 21M| 143k 326B| 0 0 | 64k 113k - 18 4 72 5 0 2| 55M 4096k| 246k 326B| 0 0 | 84k 157k - 17 4 72 6 0 2| 53M 9964k| 428k 908B| 0 0 | 84k 155k - 14 3 73 8 0 1| 46M 20M| 143k 87k| 0 0 | 71k 132k - 18 3 72 5 0 2| 52M 8880k| 318k 95k| 0 0 | 82k 149k - - => fatcat-export using about 200% CPU in a single thread; other 8 threads at about 25% each - => might be interesting to profile... too many copies? switch to tsv library? - => seems like postgres only about 50% utilized - -Benchmarks! - - # stub entity - $ wrk -t8 -c100 -d30s http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai - Running 30s test @ http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai - 8 threads and 100 connections - Thread Stats Avg Stdev Max +/- Stdev - Latency 7.44ms 5.26ms 67.53ms 60.07% - Req/Sec 1.71k 113.55 2.07k 68.79% - 408986 requests in 30.01s, 117.01MB read - Requests/sec: 13627.62 - Transfer/sec: 3.90MB - => TPS: 39958 - => lots of "BEGIN" and "SELECT 1", implying new connections? hrm. - => should run longer for more stable results - - wrk -t12 -c400 -d2m --latency http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai - - Running 2m test @ http://localhost:9411/v0/work/aaaaaaaaaaaaavkvaaaaaaaaai - 12 threads and 400 connections - Thread Stats Avg Stdev Max +/- Stdev - Latency 25.75ms 17.02ms 263.74ms 65.11% - Req/Sec 1.05k 377.38 1.82k 74.81% - Latency Distribution - 50% 25.40ms - 75% 26.56ms - 90% 49.57ms - 99% 76.47ms - 1128253 requests in 2.00m, 322.80MB read - Socket errors: connect 0, read 10, write 0, timeout 0 - Requests/sec: 9396.03 - Transfer/sec: 2.69MB - - => single 470% fatcatd thread; diesel? 
- => 50% systemd-journald - - # rich entity - $ wrk -t8 -c100 -d30s https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container - Running 30s test @ https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container - 8 threads and 100 connections - Thread Stats Avg Stdev Max +/- Stdev - Latency 33.35ms 14.01ms 152.05ms 79.98% - Req/Sec 363.14 30.36 464.00 76.48% - 86719 requests in 30.04s, 234.46MB read - Requests/sec: 2886.55 - Transfer/sec: 7.80MB - - wrk -t12 -c400 -d2m --latency https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container - - Running 2m test @ https://api.qa.fatcat.wiki/v0/release/aaaaaaaaaaaaarceaaaaaaaaam?expand=files,container - 12 threads and 400 connections - Thread Stats Avg Stdev Max +/- Stdev - Latency 149.19ms 125.30ms 1.93s 95.06% - Req/Sec 237.89 34.60 650.00 73.54% - Latency Distribution - 50% 129.86ms - 75% 134.14ms - 90% 210.42ms - 99% 904.26ms - 340625 requests in 2.00m, 0.90GB read - Socket errors: connect 0, read 0, write 0, timeout 303 - Requests/sec: 2836.17 - Transfer/sec: 7.67MB - - => TPS: 9090 - ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- - usr sys idl wai hiq siq| read writ| recv send| in out | int csw - 4 0 93 3 0 0|2401k 13M| 0 0 |2317B 4812B|4274 3824 - 22 6 68 0 0 3|4096B 0 | 57k 956B|4096B 0 | 135k 255k - 21 7 68 0 0 3|4096B 0 | 61k 66B|4096B 0 | 137k 259k - 21 7 68 0 0 3| 16k 8192B| 179k 636B| 16k 0 | 134k 252k - 21 7 68 0 0 3| 0 0 | 53k 66B| 0 0 | 135k 255k - 22 6 68 0 0 3| 0 17M| 51k 326B| 0 0 | 137k 261k - 22 6 68 0 0 3| 0 0 | 82k 903B| 0 0 | 136k 257k - - # static (404) - wrk -t12 -c400 -d2m --latency http://localhost:9411/v0/asdf - => full 2600+% CPU in a single fatcat process (?) - => lots of load in wrk also - => system at a whole basically 100% CPU on all cores, though mostly kernel - - ----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- - usr sys idl wai hiq siq| read writ| recv send| in out | int csw - 4 0 93 3 0 0|2399k 13M| 0 0 |2315B 4808B|4359 3976 - 26 66 2 0 0 6| 0 0 |4191k 956B| 0 0 | 465k 536k - 26 67 1 0 0 6| 0 0 | 511k 66B| 0 0 | 466k 543k - 27 64 3 0 0 6| 0 0 |2648k 318B| 0 0 | 481k 555k - 25 67 2 0 0 6| 0 0 |1909k 768B| 0 0 | 481k 550k - - Running 2m test @ http://localhost:9411/v0/asdf - 12 threads and 400 connections - Thread Stats Avg Stdev Max +/- Stdev - Latency 1.40ms 1.65ms 219.48ms 93.33% - Req/Sec 17.71k 8.97k 65.69k 63.84% - Latency Distribution - 50% 1.13ms - 75% 1.89ms - 90% 2.71ms - 99% 4.76ms - 21145472 requests in 2.00m, 1.61GB read - Socket errors: connect 0, read 12, write 0, timeout 0 - Non-2xx or 3xx responses: 21145472 - Requests/sec: 176071.79 - Transfer/sec: 13.77MB - -Noting pretty non-trival traffic to 207.241.229.230 and -es-worker-02.us.archive.org (207.241.229.246)... probably because this is a -datanode? - -## Misc - -Postgres really wanted to log: - - ALTER DATABASE fatcat_prod SET log_statement = 'ddl'; - ALTER USER fatcat SET log_statement = 'ddl'; - - -## Later Imports - - zcat /srv/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - - => only ~40 TPS - => looks like almost all selects... probably skipping? huge postgres CPU churn, huh. missing index? 
- - relname | too_much_seq | case | rel_size | seq_scan | idx_scan - ----------------------+--------------+----------------+-------------+----------+---------- - file_rev | -5343173 | OK | 867483648 | 693 | 5343866 - - EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '7d97e98f8af710c7e7fe703abc8f639e0ee507c4' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1; - -Ugh, this whole cycle again. Query only takes 1ms for a no-hit, or 5ms for success. - - http get http://localhost:9411/v0/file/lookup?sha1=d77cb2a8b207507e0df27ba1393e8118eec9217f => not found - http get http://localhost:9411/v0/file/lookup?sha1=d6c3f6ca785ee63293da02615c8495c5cd25fa25 => found - => but no releases? - -SET auto_explain.log_min_duration = 0; -SET auto_explain.log_analyze = true; -ALTER USER fatcat SET log_statement = 'all'; -ALTER USER fatcat SET auto_explain.log_min_duration = 0; -ALTER USER fatcat SET auto_explain.log_analyze = true;; -sudo service fatcat-api restart - - -2018-09-16 00:42:29.883 UTC [13868] fatcat@fatcat_prod LOG: duration: 3028.143 ms plan: - Query Text: SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = $1 AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = $2 AND "file_ident"."redirect_id" IS NULL LIMIT $3 - Limit (cost=0.43..8.56 rows=1 width=454) (actual time=3028.139..3028.139 rows=0 loops=1) - -> Nested Loop (cost=0.43..268756.48 rows=33056 width=454) (actual time=3028.138..3028.138 rows=0 loops=1) - -> Seq Scan on file_rev (cost=0.00..205023.16 rows=33042 width=405) (actual time=3028.137..3028.137 rows=0 loops=1) - Filter: ((sha1 IS NOT NULL) AND ((sha1)::text = 'd68d4e2cc420647109cdfe410efed86cd64465e5'::text)) - Rows Removed by Filter: 6607786 - -> Index Scan using file_ident_rev_idx on file_ident (cost=0.43..1.92 rows=1 width=49) (never executed) - Index Cond: (rev_id = file_rev.id) - Filter: (is_live AND (redirect_id IS NULL)) - - -fatcat_prod=# EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '3f242a192acc258bdfdb15194341943222222222222440c313' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1; - QUERY PLAN --------------------------------------------------------------------------------------------------------------------------- - Limit (cost=0.99..5.04 rows=1 width=454) - -> Nested Loop (cost=0.99..5.04 rows=1 width=454) - -> Index Scan using file_rev_sha1_idx on file_rev (cost=0.56..2.58 rows=1 width=405) - Index Cond: ((sha1 IS NOT NULL) AND (sha1 = '3f242a192acc258bdfdb15194341943222222222222440c313'::bpchar)) - -> Index Scan using 
file_ident_rev_idx on file_ident (cost=0.43..2.45 rows=1 width=49) - Index Cond: (rev_id = file_rev.id) - Filter: (is_live AND (redirect_id IS NULL)) - -After all this debugging, asked on IRC and learned that I really shouldn't ever -use CHAR() (or even VARCHAR()). Will update the schema. - -## Investigating Missing Rows - - fatcat_prod=# select count(*) from release_ident; => 53028167 - -Thought I had more than that previously? Table sizes were also bigger. Hrm. - diff --git a/notes/import_timing_20180920.txt b/notes/import_timing_20180920.txt deleted file mode 100644 index a57ffd77..00000000 --- a/notes/import_timing_20180920.txt +++ /dev/null @@ -1,96 +0,0 @@ - -This was on fatcat-prod-vm (2TB disk). - - time ./fatcat_import.py import-issn /srv/fatcat/datasets/journal_extra_metadata.csv - - Processed 53300 lines, inserted 53283, updated 0. - real 0m32.463s - user 0m8.716s - sys 0m0.284s - - time parallel --bar --pipepart -j8 -a /srv/fatcat/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - - - Processed 48900 lines, inserted 48731, updated 0. <= these numbers times 80x - 100% 80:0=0s - - real 10m20.598s - user 26m16.544s - sys 1m40.284s - - time xzcat /srv/fatcat/datasets/crossref-works.2018-01-21.json.xz | time parallel -j20 --round-robin --pipe ./fatcat_import.py import-crossref - /srv/fatcat/datasets/20180216.ISSN-to-ISSN-L.txt /srv/fatcat/datasets/release_ids.ia_munge_20180908.sqlite3 - - Processed 4679900 lines, inserted 3755867, updated 0. - 107730.08user 4110.22system 16:31:25elapsed 188%CPU (0avgtext+0avgdata 447496maxresident)k - 77644160inputs+361948352outputs (105major+49094767minor)pagefaults 0swaps - - => 16.5 hours, faster! - - select count(id) from release_ident; => 75106713 - - kernel/system crashed after first file import (!), so don't have numbers from that. 
- -Table sizes at this point: - - select count(id) from file_ident; => 6334606 - - Size: 389.25G - - table_name | table_size | indexes_size | total_size ---------------------------------------------------------------+------------+--------------+------------ - "public"."release_ref" | 170 GB | 47 GB | 217 GB - "public"."release_rev" | 44 GB | 21 GB | 65 GB - "public"."release_contrib" | 19 GB | 20 GB | 39 GB - "public"."release_edit" | 6671 MB | 6505 MB | 13 GB - "public"."work_edit" | 6671 MB | 6505 MB | 13 GB - "public"."release_ident" | 4892 MB | 5875 MB | 11 GB - "public"."work_ident" | 4892 MB | 5874 MB | 11 GB - "public"."work_rev" | 3174 MB | 2936 MB | 6109 MB - "public"."file_rev_url" | 3634 MB | 1456 MB | 5090 MB - "public"."file_rev" | 792 MB | 1281 MB | 2073 MB - "public"."abstracts" | 1665 MB | 135 MB | 1800 MB - "public"."file_edit" | 565 MB | 561 MB | 1126 MB - "public"."file_release" | 380 MB | 666 MB | 1045 MB - "public"."file_ident" | 415 MB | 496 MB | 911 MB - "public"."creator_rev" | 371 MB | 457 MB | 828 MB - "public"."creator_edit" | 347 MB | 353 MB | 700 MB - "public"."creator_ident" | 255 MB | 305 MB | 559 MB - "public"."release_rev_abstract" | 183 MB | 237 MB | 421 MB - "public"."changelog" | 122 MB | 126 MB | 247 MB - "public"."editgroup" | 138 MB | 81 MB | 219 MB - "public"."container_rev" | 52 MB | 38 MB | 89 MB - "public"."container_edit" | 32 MB | 30 MB | 62 MB - "public"."container_ident" | 24 MB | 28 MB | 52 MB - - - relname | too_much_seq | case | rel_size | seq_scan | idx_scan -----------------------+--------------+------+--------------+----------+---------- - release_edit | 0 | OK | 6993084416 | 0 | 0 - container_rev | 0 | OK | 54124544 | 0 | 0 - creator_ident | 0 | OK | 266919936 | 0 | 0 - creator_edit | 0 | OK | 363921408 | 0 | 0 - work_rev | 0 | OK | 3327262720 | 0 | 0 - creator_rev | 0 | OK | 388726784 | 0 | 0 - work_ident | 0 | OK | 5128560640 | 0 | 0 - work_edit | 0 | OK | 6993108992 | 0 | 0 - container_ident | 0 | OK | 25092096 | 0 | 0 - file_edit | 0 | OK | 598278144 | 0 | 0 - container_edit | 0 | OK | 33857536 | 0 | 0 - changelog | 0 | OK | 127549440 | 0 | 0 - abstracts | -4714 | OK | 1706713088 | 0 | 4714 - file_release | -13583 | OK | 401752064 | 0 | 13583 - file_rev_url | -13583 | OK | 3832389632 | 0 | 13583 - editgroup | -74109 | OK | 144277504 | 0 | 74109 - release_contrib | -76699 | OK | 20357849088 | 0 | 76699 - release_ref | -76700 | OK | 183009157120 | 3 | 76703 - release_rev_abstract | -76939 | OK | 192102400 | 0 | 76939 - release_rev | -77965 | OK | 47602647040 | 0 | 77965 - file_ident | -100089 | OK | 438255616 | 3 | 100092 - release_ident | -152809 | OK | 5128617984 | 0 | 152809 - file_rev | -440780 | OK | 837705728 | 0 | 440780 -(23 rows) - -Continuing imports: - - zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - - => HTTP response body: {"message":"duplicate key value violates unique constraint \"file_edit_editgroup_id_ident_id_key\""} - diff --git a/notes/import_timing_20180923.txt b/notes/import_timing_20180923.txt deleted file mode 100644 index f8814f3d..00000000 --- a/notes/import_timing_20180923.txt +++ /dev/null @@ -1,39 +0,0 @@ - - 105595.18user 3903.65system 15:59:39elapsed 190%CPU (0avgtext+0avgdata 458836maxresident)k - 71022792inputs+327828472outputs (176major+31149593minor)pagefaults 0swaps - - real 959m39.521s - user 1845m10.392s - sys 70m33.780s - -Did I get the same error again? 
I'm confused: - - HTTP response body: {"message":"number of parameters must be between 0 and 65535\n"} - (but not in all threads) - -Yes, ugh, because 50*2500 can be over (it's not just individual large releases, -they come in big batches). - -But: - - select count(id) from release_ident; => 70006121 - -A lot, though not 72 million like last time, hrm. I'm... going to move ahead I -guess. - -"Processed 4440850 lines, inserted 3509600, updated 0." - => implies 79029915 records - - time zcat /srv/fatcat/datasets/ia_papers_manifest_2018-01-25.matched.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched --no-file-update - - Processed 530750 lines, inserted 435239, updated 0. (etc) - Command exited with non-zero status 1 - 15121.47user 676.49system 2:23:52elapsed 183%CPU (0avgtext+0avgdata 70076maxresident)k - 127760inputs+3477184outputs (116major+475489minor)pagefaults 0swaps - - real 143m52.681s - user 252m31.620s - sys 11m21.608s - - zcat /srv/fatcat/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - - - (running...) diff --git a/notes/initial_sources.txt b/notes/initial_sources.txt deleted file mode 100644 index cc22019d..00000000 --- a/notes/initial_sources.txt +++ /dev/null @@ -1,26 +0,0 @@ - -Probably start with: - - crossref (including citations) - arxiv - medline - -then merge in: - - dblp - CORE - MSAG dump - VIAF - archive.org paper/url manifest - semantic scholar - oaDOI - -and later: - - wikidata - opencitations - openlibrary - -national libraries: - - http://www.dnb.de/EN/Service/DigitaleDienste/LinkedData/linkeddata_node.html diff --git a/notes/old_imports.txt b/notes/old_imports.txt deleted file mode 100644 index 1233d4a8..00000000 --- a/notes/old_imports.txt +++ /dev/null @@ -1,20 +0,0 @@ - -## ORCID - -Directly from compressed tarball; takes about 2 hours in production: - - tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | grep '"person":' | time parallel -j12 --pipe --round-robin ./fatcat_import.py import-orcid - - -After tuning database, `jq` CPU seems to be bottleneck, so, from pre-extracted -tarball: - - tar xf /srv/datasets/public_profiles_API-2.0_2017_10_json.tar.gz -O | jq -c . | rg '"person":' > /srv/datasets/public_profiles_1_2_json.all.json - time parallel --bar --pipepart -j8 -a /srv/datasets/public_profiles_1_2_json.all.json ./fatcat_import.py import-orcid - - -Does not work: - - ./fatcat_import.py import-orcid /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json - -Instead: - - cat /data/orcid/partial/public_profiles_API-2.0_2017_10_json/3/0000-0001-5115-8623.json | jq -c . | ./fatcat_import.py import-orcid - diff --git a/notes/performance/postgres_performance.txt b/notes/performance/postgres_performance.txt new file mode 100644 index 00000000..cd2a5162 --- /dev/null +++ b/notes/performance/postgres_performance.txt @@ -0,0 +1,203 @@ + +## Setup + +Add to postgres.conf: + + shared_preload_libraries = 'auto_explain,pg_stat_statements' + + # Increase the max size of the query strings Postgres records + track_activity_query_size = 2048 + + # Track statements generated by stored procedures as well + pg_stat_statements.track = all + +Also: + + track_counts (already default) + autovacuum (already default?) 
+ log_min_error = warning + log_min_duration_statement = 5000 + +Then from shell: + + create extension pg_stat_statements; + +Regularly want to run: + + VACUUM ANALYZE + +## Tuning Values + +postgres config: + + max_connections = 100 (default) + shared_buffers = 128MB -> 10GB (while elastic on same machine; later 16 or more) + effective_cache_size = 4GB -> 24GB (while elastic on same machine) + work_mem = 4MB -> 128MB # relatively few connections/box + fsync = on + commit_delay = ??? (and siblings) + random_page_cost = 1 (for SSD) + default_statistics_target = 100 -> 200 + maintenance_work_mem = 64MB -> 8GB + synchronous_commit = off (during dev only! switch to on for production!) + wal_sync_method (keep default) + max_wal_size = 64 -> 128 (based on above HINT message) + # didn't mess with commit_delay/commit_siblings + +system: + + sysctl -w vm.overcommit_memory=2 + TODO: ulimit -n 65536 + TODO: ulimit -p 800 + LimitNOFILE + /lib/systemd/system/postgresql.service + +## Resources + +https://www.geekytidbits.com/performance-tuning-postgres/ + +Could try pgbadger to handle auto_explain type output. + +https://www.postgresql.org/docs/10/static/runtime-config-wal.html + +IA-specific resources: + https://git.archive.org/ia/mint/blob/master/postgres/postgres_9_2.yml + https://git.archive.org/ia/mint/blob/master/postgres/put_datadir_on_ssd.sh + https://git.archive.org/ia/mint/blob/master/postgres/templates/postgresql.conf.j2 + +For bulk inserts: +- make write-ahead-log larger (eg, 16MB. done.) +- transactions of ~1000+ inserts +- https://www.postgresql.org/docs/current/static/populate.html +- https://www.depesz.com/2007/07/05/how-to-insert-data-to-database-as-fast-as-possible/ +- https://stackoverflow.com/questions/12206600/how-to-speed-up-insertion-performance-in-postgresql + +## 2018-06-27 Measurements (pre-tuning) + +fatcat_prod=# select count(*) from release_ident; 20983019 +fatcat_prod=# select count(*) from work_ident; 20988140 +fatcat_prod=# select count(*) from file_ident; 1482335 +fatcat_prod=# select count(*) from creator_ident; 4167419 +fatcat_prod=# select count(*) from container_ident; 61793 + +select count(*) from release_contrib; 59798133 + +bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/ +43G + +running import-crossref with 20 threads, and manifest importer with one (at 33% +complete). had already imported ~7million works+releases previously. 
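
The entity counts above were taken one query at a time while imports were
running; a single snapshot query over the same tables (a sketch) keeps the
numbers mutually consistent:

    SELECT 'release' AS entity, count(*) FROM release_ident
    UNION ALL SELECT 'work', count(*) FROM work_ident
    UNION ALL SELECT 'file', count(*) FROM file_ident
    UNION ALL SELECT 'creator', count(*) FROM creator_ident
    UNION ALL SELECT 'container', count(*) FROM container_ident;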
+ + + PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgre + Size: 41.38G - 323.40K/s | TPS: 885 + Mem.: 50.80% - 23.86G/49.14G | IO Max: 79539/s + Swap: 0.80% - 408.89M/50.00G | Read : 67.04K/s - 16/s + Load: 6.69 7.41 7.69 | Write: 1.93M/s - 493/s + + +----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- +usr sys idl wai hiq siq| read writ| recv send| in out | int csw + 32 6 62 0 0 0| 296k 3880k| 334k 3144B| 0 0 | 21k 65k + 31 6 62 0 0 0| 0 3072k| 391k 318B| 0 0 | 51k 141k + 31 6 63 0 0 0| 16k 1080k| 344k 1988B| 0 0 | 35k 104k + 29 6 65 0 0 0| 136k 2608k| 175k 332B| 0 0 |9835 15k + 28 5 67 0 0 0| 408k 4368k| 285k 832B| 0 0 | 14k 17k + 33 5 62 0 0 0| 56k 3256k| 219k 99B| 0 0 | 22k 49k + 31 6 63 0 0 0| 188k 5120k| 158k 318B| 0 0 | 17k 29k + 30 6 64 0 0 0| 200k 6984k| 239k 988B| 0 0 | 16k 24k + 30 6 64 0 0 0| 168k 5504k| 159k 152B| 0 0 | 14k 20k + 28 7 65 0 0 0| 440k 12M| 236k 420B| 0 0 | 15k 18k + 29 6 65 0 0 0| 428k 6968k| 352k 310B| 0 0 | 19k 31k + 32 6 62 0 0 0| 64k 3480k| 288k 318B| 0 0 | 18k 55k + 32 6 62 0 0 0| 32k 2080k| 155k 318B| 0 0 | 20k 52k + + +bnewbold@wbgrp-svc500$ uptime + 22:00:42 up 28 days, 22:31, 6 users, load average: 7.94, 7.56, 7.72 + + +2018-06-27 21:57:36.102 UTC [401] LOG: checkpoints are occurring too frequently (13 seconds apart) +2018-06-27 21:57:36.102 UTC [401] HINT: Consider increasing the configuration parameter "max_wal_size". + + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan +-----------------+--------------+----------------+-------------+----------+----------- + changelog | 1274670 | Missing Index? | 39411712 | 1274670 | 0 + file_edit | 612386 | Missing Index? | 108298240 | 612386 | 0 + creator_edit | 612386 | Missing Index? | 285540352 | 612386 | 0 + container_edit | 612386 | Missing Index? | 4784128 | 612386 | 0 + release_edit | 612386 | Missing Index? | 1454489600 | 612386 | 0 + work_edit | 612386 | Missing Index? | 1454415872 | 612386 | 0 + release_contrib | 296675 | Missing Index? | 4725645312 | 296675 | 0 + release_ref | 296663 | Missing Index? 
| 8999837696 | 296663 | 0
+ file_release | -113 | OK | 13918208 | 110 | 223
+ container_rev | -979326 | OK | 16285696 | 63 | 979389
+ file_ident | -3671516 | OK | 109002752 | 362 | 3671878
+ file_rev | -3944155 | OK | 302940160 | 95 | 3944250
+ creator_rev | -8420205 | OK | 318283776 | 1226 | 8421431
+ creator_ident | -9525338 | OK | 309141504 | 52330 | 9577668
+ container_ident | -20581876 | OK | 4833280 | 272457 | 20854333
+ release_ident | -40548858 | OK | 1440948224 | 4160919 | 44709777
+ work_rev | -42534913 | OK | 1124671488 | 1161 | 42536074
+ editgroup | -48864662 | OK | 34136064 | 1 | 48864663
+ work_ident | -65008911 | OK | 1503313920 | 1239 | 65010150
+ release_rev | -185735794 | OK | 13649428480 | 128 | 185735922
+
+## 2018-06-28 (after basic tuning + indexes)
+
+Early loading (manifest and 20x release):
+
+ PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgres - Ref.: 2s
+ Size: 4.57G - 6.45M/s | TPS: 18812
+ Mem.: 59.70% - 23.62G/49.14G | IO Max: 3601/s
+ Swap: 1.30% - 675.05M/50.00G | Read : 0.00B/s - 0/s
+ Load: 12.98 10.58 5.25 | Write: 2.65M/s - 677/s
+
+
+ PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
+ 24045 webcrawl 20 0 908872 204948 11756 S 153.3 0.4 16:13.03 fatcatd
+ 24328 webcrawl 20 0 78148 45220 4324 R 87.1 0.1 8:44.16 perl
+ 24056 postgres 20 0 10.441g 3.906g 3.886g R 69.9 7.9 6:57.47 postgres
+ 24063 postgres 20 0 10.447g 3.899g 3.873g S 67.9 7.9 6:55.89 postgres
+ 24059 postgres 20 0 10.426g 3.888g 3.883g R 67.5 7.9 6:59.15 postgres
+ 24057 postgres 20 0 10.430g 3.883g 3.874g S 67.2 7.9 6:58.68 postgres
+ 24061 postgres 20 0 10.448g 3.909g 3.881g R 66.2 8.0 6:54.30 postgres
+ 24058 postgres 20 0 10.428g 3.883g 3.876g R 65.9 7.9 6:59.35 postgres
+ 24062 postgres 20 0 10.426g 5.516g 5.511g R 64.9 11.2 6:58.29 postgres
+ 24055 postgres 20 0 10.426g 3.878g 3.873g R 64.2 7.9 6:59.38 postgres
+ 24054 postgres 20 0 10.430g 5.499g 5.491g R 63.6 11.2 6:57.27 postgres
+ 24060 postgres 20 0 10.448g 3.900g 3.873g R 61.9 7.9 6:55.45 postgres
+ 21711 postgres 20 0 10.419g 5.762g 5.760g D 16.6 11.7 3:00.67 postgres
+ 21713 postgres 20 0 10.419g 21432 19512 S 11.3 0.0 3:25.11 postgres
+ 24392 webcrawl 20 0 5309636 400912 8696 S 7.9 0.8 0:53.18 python3
+ 24383 webcrawl 20 0 5309436 400628 8648 S 7.6 0.8 0:52.29 python3
+ 24387 webcrawl 20 0 5309776 402968 8620 S 7.3 0.8 0:52.81 python3
+ 24394 webcrawl 20 0 5309624 400732 8644 S 7.3 0.8 0:53.30 python3
+ 24384 webcrawl 20 0 5309916 400948 8600 S 7.0 0.8 0:53.18 python3
+
+Still get a *lot* of:
+
+ 2018-06-29 00:14:05.948 UTC [21711] LOG: checkpoints are occurring too frequently (1 second apart)
+ 2018-06-29 00:14:05.948 UTC [21711] HINT: Consider increasing the configuration parameter "max_wal_size".
+
+VACUUM is running basically continuously; should prevent that? It takes 6
+hours or longer on the release_rev and release_ref tables. An auto-approve
+batch method would resolve this, I think (no update after insert).
+
+max_wal_size wasn't getting set correctly.
+
+The statements taking the most time are the complex inserts (multi-table
+inserts); they take a fraction of a second though (mean less than a
+millisecond).
+
+Manifest import runs really slowly if release import is concurrent; it is
+much faster to wait until the release import finishes first (a factor of 10x
+or more).
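+
+To confirm which statements dominate, pg_stat_statements (enabled in the Setup
+section above) can be ranked by cumulative time; a sketch, assuming the
+PostgreSQL 10 column names (total_time/mean_time are in milliseconds; newer
+versions rename these to total_exec_time/mean_exec_time):
+
+    -- top statements by total time spent; query_start is just a truncated
+    -- label for readability
+    SELECT substring(query, 1, 60) AS query_start,
+           calls,
+           round(total_time::numeric, 1) AS total_ms,
+           round(mean_time::numeric, 3) AS mean_ms,
+           rows
+    FROM pg_stat_statements
+    ORDER BY total_time DESC
+    LIMIT 10;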
+
+With some 60 million releases:
+
+ bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/
+ 184G /var/lib/postgresql/
+
+TODO: slow query log doesn't seem to be working (let alone auto_explain)
+
diff --git a/notes/performance/speed.txt b/notes/performance/speed.txt
new file mode 100644
index 00000000..f885aea7
--- /dev/null
+++ b/notes/performance/speed.txt
@@ -0,0 +1,82 @@
+
+## Early Prototyping
+
+### 2018-04-23
+
+- fatcat as marshmallow+sqlalchemy+flask, with API client
+- no refs, contribs, files, release contribs, containers, etc
+- no extra_json
+- sqlite
+- laptop
+- editgroup every 250 edits
+
+
+ /data/crossref/crossref-works.2018-01-21.badsample_5k.json
+
+ real 3m42.912s
+ user 0m20.448s
+ sys 0m2.852s
+
+ ~22 lines per second
+ 12.5 hours per million
+ ~52 days for crossref (100 million)
+
+target:
+ crossref (100 million) loaded in 48 hours
+ 579 lines per second
+ this test in under 10 seconds
+ ... but could be in parallel
+
+same except postgres, via:
+
+ docker run -p 5432:5432 postgres:latest
+ ./run.py --init-db --database-uri postgres://postgres@localhost:5432
+ ./run.py --database-uri postgres://postgres@localhost:5432
+
+ API processing using 60-100% of a core. postgres 12% of a core;
+ docker-proxy similar (!). overall ~70% of system CPU idle.
+
+ real 2m27.771s
+ user 0m22.860s
+ sys 0m2.852s
+
+no profiling yet; need to look at database ops. probably don't even have any
+indices!
+
+## Rust Updates (2018-05-23)
+
+Re-running with tweaked python code, 5k sample file, postgres 9.6 running locally (not in docker):
+
+ real 2m27.598s
+ user 0m24.892s
+ sys 0m2.836s
+
+Using postgres and fatcat rust:
+
+ real 0m44.443s
+ user 0m25.288s
+ sys 0m0.880s
+
+api_client about half a core; fatcatd 3x processes, about 10% each; postgres
+very small.
+
+a bit faster, basically maxing out CPU:
+
+ time cat /data/crossref/crossref-works.2018-01-21.badsample_5k.json | parallel -j4 --pipe ./fatcat_client.py --host-url http://localhost:9411 ic -
+
+ real 0m28.998s
+ user 1m5.304s
+ sys 0m3.420s
+
+ 200 lines per second; within a factor of 3; can perhaps hit target with
+ non-python client?
+
+python processes (clients) seem to be the CPU limit in this case; all 4 cores
+effectively maxed out.
+
+running python again in parallel mode:
+
+ real 2m29.532s
+ user 0m47.692s
+ sys 0m4.840s
+
diff --git a/notes/postgres_performance.txt b/notes/postgres_performance.txt
deleted file mode 100644
index cd2a5162..00000000
--- a/notes/postgres_performance.txt
+++ /dev/null
@@ -1,203 +0,0 @@
-
-## Setup
-
-Add to postgres.conf:
-
- shared_preload_libraries = 'auto_explain,pg_stat_statements'
-
- # Increase the max size of the query strings Postgres records
- track_activity_query_size = 2048
-
- # Track statements generated by stored procedures as well
- pg_stat_statements.track = all
-
-Also:
-
- track_counts (already default)
- autovacuum (already default?)
- log_min_error = warning
- log_min_duration_statement = 5000
-
-Then from shell:
-
- create extension pg_stat_statements;
-
-Regularly want to run:
-
- VACUUM ANALYZE
-
-## Tuning Values
-
-postgres config:
-
- max_connections = 100 (default)
- shared_buffers = 128MB -> 10GB (while elastic on same machine; later 16 or more)
- effective_cache_size = 4GB -> 24GB (while elastic on same machine)
- work_mem = 4MB -> 128MB # relatively few connections/box
- fsync = on
- commit_delay = ???
(and siblings) - random_page_cost = 1 (for SSD) - default_statistics_target = 100 -> 200 - maintenance_work_mem = 64MB -> 8GB - synchronous_commit = off (during dev only! switch to on for production!) - wal_sync_method (keep default) - max_wal_size = 64 -> 128 (based on above HINT message) - # didn't mess with commit_delay/commit_siblings - -system: - - sysctl -w vm.overcommit_memory=2 - TODO: ulimit -n 65536 - TODO: ulimit -p 800 - LimitNOFILE - /lib/systemd/system/postgresql.service - -## Resources - -https://www.geekytidbits.com/performance-tuning-postgres/ - -Could try pgbadger to handle auto_explain type output. - -https://www.postgresql.org/docs/10/static/runtime-config-wal.html - -IA-specific resources: - https://git.archive.org/ia/mint/blob/master/postgres/postgres_9_2.yml - https://git.archive.org/ia/mint/blob/master/postgres/put_datadir_on_ssd.sh - https://git.archive.org/ia/mint/blob/master/postgres/templates/postgresql.conf.j2 - -For bulk inserts: -- make write-ahead-log larger (eg, 16MB. done.) -- transactions of ~1000+ inserts -- https://www.postgresql.org/docs/current/static/populate.html -- https://www.depesz.com/2007/07/05/how-to-insert-data-to-database-as-fast-as-possible/ -- https://stackoverflow.com/questions/12206600/how-to-speed-up-insertion-performance-in-postgresql - -## 2018-06-27 Measurements (pre-tuning) - -fatcat_prod=# select count(*) from release_ident; 20983019 -fatcat_prod=# select count(*) from work_ident; 20988140 -fatcat_prod=# select count(*) from file_ident; 1482335 -fatcat_prod=# select count(*) from creator_ident; 4167419 -fatcat_prod=# select count(*) from container_ident; 61793 - -select count(*) from release_contrib; 59798133 - -bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/ -43G - -running import-crossref with 20 threads, and manifest importer with one (at 33% -complete). had already imported ~7million works+releases previously. - - - PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgre - Size: 41.38G - 323.40K/s | TPS: 885 - Mem.: 50.80% - 23.86G/49.14G | IO Max: 79539/s - Swap: 0.80% - 408.89M/50.00G | Read : 67.04K/s - 16/s - Load: 6.69 7.41 7.69 | Write: 1.93M/s - 493/s - - -----total-cpu-usage---- -dsk/total- -net/total- ---paging-- ---system-- -usr sys idl wai hiq siq| read writ| recv send| in out | int csw - 32 6 62 0 0 0| 296k 3880k| 334k 3144B| 0 0 | 21k 65k - 31 6 62 0 0 0| 0 3072k| 391k 318B| 0 0 | 51k 141k - 31 6 63 0 0 0| 16k 1080k| 344k 1988B| 0 0 | 35k 104k - 29 6 65 0 0 0| 136k 2608k| 175k 332B| 0 0 |9835 15k - 28 5 67 0 0 0| 408k 4368k| 285k 832B| 0 0 | 14k 17k - 33 5 62 0 0 0| 56k 3256k| 219k 99B| 0 0 | 22k 49k - 31 6 63 0 0 0| 188k 5120k| 158k 318B| 0 0 | 17k 29k - 30 6 64 0 0 0| 200k 6984k| 239k 988B| 0 0 | 16k 24k - 30 6 64 0 0 0| 168k 5504k| 159k 152B| 0 0 | 14k 20k - 28 7 65 0 0 0| 440k 12M| 236k 420B| 0 0 | 15k 18k - 29 6 65 0 0 0| 428k 6968k| 352k 310B| 0 0 | 19k 31k - 32 6 62 0 0 0| 64k 3480k| 288k 318B| 0 0 | 18k 55k - 32 6 62 0 0 0| 32k 2080k| 155k 318B| 0 0 | 20k 52k - - -bnewbold@wbgrp-svc500$ uptime - 22:00:42 up 28 days, 22:31, 6 users, load average: 7.94, 7.56, 7.72 - - -2018-06-27 21:57:36.102 UTC [401] LOG: checkpoints are occurring too frequently (13 seconds apart) -2018-06-27 21:57:36.102 UTC [401] HINT: Consider increasing the configuration parameter "max_wal_size". - - - relname | too_much_seq | case | rel_size | seq_scan | idx_scan ------------------+--------------+----------------+-------------+----------+----------- - changelog | 1274670 | Missing Index? 
| 39411712 | 1274670 | 0 - file_edit | 612386 | Missing Index? | 108298240 | 612386 | 0 - creator_edit | 612386 | Missing Index? | 285540352 | 612386 | 0 - container_edit | 612386 | Missing Index? | 4784128 | 612386 | 0 - release_edit | 612386 | Missing Index? | 1454489600 | 612386 | 0 - work_edit | 612386 | Missing Index? | 1454415872 | 612386 | 0 - release_contrib | 296675 | Missing Index? | 4725645312 | 296675 | 0 - release_ref | 296663 | Missing Index? | 8999837696 | 296663 | 0 - file_release | -113 | OK | 13918208 | 110 | 223 - container_rev | -979326 | OK | 16285696 | 63 | 979389 - file_ident | -3671516 | OK | 109002752 | 362 | 3671878 - file_rev | -3944155 | OK | 302940160 | 95 | 3944250 - creator_rev | -8420205 | OK | 318283776 | 1226 | 8421431 - creator_ident | -9525338 | OK | 309141504 | 52330 | 9577668 - container_ident | -20581876 | OK | 4833280 | 272457 | 20854333 - release_ident | -40548858 | OK | 1440948224 | 4160919 | 44709777 - work_rev | -42534913 | OK | 1124671488 | 1161 | 42536074 - editgroup | -48864662 | OK | 34136064 | 1 | 48864663 - work_ident | -65008911 | OK | 1503313920 | 1239 | 65010150 - release_rev | -185735794 | OK | 13649428480 | 128 | 185735922 - -## 2018-06-28 (after basic tuning + indexes) - -Early loading (manifest and 20x release): - - PostgreSQL 10.4 - wbgrp-svc500.us.archive.org - postgres@localhost:5432/postgres - Ref.: 2s - Size: 4.57G - 6.45M/s | TPS: 18812 - Mem.: 59.70% - 23.62G/49.14G | IO Max: 3601/s - Swap: 1.30% - 675.05M/50.00G | Read : 0.00B/s - 0/s - Load: 12.98 10.58 5.25 | Write: 2.65M/s - 677/s - - - PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND - 24045 webcrawl 20 0 908872 204948 11756 S 153.3 0.4 16:13.03 fatcatd - 24328 webcrawl 20 0 78148 45220 4324 R 87.1 0.1 8:44.16 perl - 24056 postgres 20 0 10.441g 3.906g 3.886g R 69.9 7.9 6:57.47 postgres - 24063 postgres 20 0 10.447g 3.899g 3.873g S 67.9 7.9 6:55.89 postgres - 24059 postgres 20 0 10.426g 3.888g 3.883g R 67.5 7.9 6:59.15 postgres - 24057 postgres 20 0 10.430g 3.883g 3.874g S 67.2 7.9 6:58.68 postgres - 24061 postgres 20 0 10.448g 3.909g 3.881g R 66.2 8.0 6:54.30 postgres - 24058 postgres 20 0 10.428g 3.883g 3.876g R 65.9 7.9 6:59.35 postgres - 24062 postgres 20 0 10.426g 5.516g 5.511g R 64.9 11.2 6:58.29 postgres - 24055 postgres 20 0 10.426g 3.878g 3.873g R 64.2 7.9 6:59.38 postgres - 24054 postgres 20 0 10.430g 5.499g 5.491g R 63.6 11.2 6:57.27 postgres - 24060 postgres 20 0 10.448g 3.900g 3.873g R 61.9 7.9 6:55.45 postgres - 21711 postgres 20 0 10.419g 5.762g 5.760g D 16.6 11.7 3:00.67 postgres - 21713 postgres 20 0 10.419g 21432 19512 S 11.3 0.0 3:25.11 postgres - 24392 webcrawl 20 0 5309636 400912 8696 S 7.9 0.8 0:53.18 python3 - 24383 webcrawl 20 0 5309436 400628 8648 S 7.6 0.8 0:52.29 python3 - 24387 webcrawl 20 0 5309776 402968 8620 S 7.3 0.8 0:52.81 python3 - 24394 webcrawl 20 0 5309624 400732 8644 S 7.3 0.8 0:53.30 python3 - 24384 webcrawl 20 0 5309916 400948 8600 S 7.0 0.8 0:53.18 python3 - -Still get a *lot* of: - - 2018-06-29 00:14:05.948 UTC [21711] LOG: checkpoints are occurring too frequently (1 second apart) - 2018-06-29 00:14:05.948 UTC [21711] HINT: Consider increasing the configuration parameter "max_wal_size". - -VACUUM is running basically continuously; should prevent that? 6 hours or -longer on release_rev and release_ref tables. An auto-approve batch method -would resovle this, I think (no update after insert). - -max_wal_size wasn't getting set correctly. 
- -The statements taking the most time are the complex inserts (multi-table -inserts); they take a fraction of a second though (mean less than a -milisecond). - -Manifest import runs really slow if release import is concurrent; much faster -to wait until release import is done first (like a factor of 10x or more). - -With some 60 million releases: - - bnewbold@wbgrp-svc500$ sudo du -sh /var/lib/postgresql/ - 184G /var/lib/postgresql/ - -TODO: slow query log doesn't seem to be working (let alone auto_explain) - diff --git a/notes/schema/alignments.csv b/notes/schema/alignments.csv new file mode 100644 index 00000000..b8619ddc --- /dev/null +++ b/notes/schema/alignments.csv @@ -0,0 +1,57 @@ +fatcat entity,fatcat field,description,Crossref,ORCID,Bibtex,CSL,BIBFRAME,Dublin Core (DCMES),Dublin Core (DCMI),oai_dc (OAI-PMH),resourceSync,Highwire Press (google scholar),MEDLINE (?) +WORK,,,,,,,Work,,,,,, +,work_type,,,,,,,Type,,,,, +RELEASE,,,,,,,Instance,,,,,, +,title,,title,,title,,,Title,,title,,citation_title, +,release_type,,type (???),,object type; howpublished,,,Type,,,,, +,release_status,"NEW; eg, preprint, final",,,,,,,,,,, +,container,,ISSN (via lookup),,ISSN (via lookup),,,Relation (?),,,,citation_issn (normalized), +,release_date,RENAME,published-print or published-online,,"year, month",issued,,Date,Issued (?),,,citation_publication_date, +,volume,,volume,,volume,,,,,,,citation_volume, +,pages,,page,,pages,,,,,,,"citation_firstpage, citation_lastpage", +,issue,,issue,,,,,,,,,citation_issue, +,doi,always lower-case,DOI,,doi,DOI,,Identifier,,,,citation_doi, +,isbn13,,ISBN (converted),,,ISBN,,Identifier,,,,citation_isbn, +,publisher,,publisher,,publisher,,,Publisher,,,,"citation_dissertation_institution, citation_technical_report_institution", +,language,NEW; RFC1766 (ISO 639-1 superset); may transition,,,,,,Language,,,,, +,ref:index,,reference:[index],,,Citation-number,,,,,,, +,ref:key,NEW,reference:key,,,citation-label,,,,,,, +,ref:raw,RENAME,reference:unstructured,,,,,,,,,, +,ref:container_title,NEW,reference:journal-title,,,,,,,,,, +,ref:year,NEW,reference:year,,,,,,,,,, +,ref:title,NEW,,,,,,,,,,, +,ref:locator,"NEW; (aka, page number)",,,,,,,,,,, +,contrib:role,,,,,,Role,,,,,, +,contrib:index,,author[index],,,,,,,,,, +,contrib:raw,RENAME,Author:{given+family},,author,,,,,,,, +,extra:number,tech reports,,,,,,,,,,citation_technical_report_number, +,extra:institution,for tech reports and dissertations,,,,,,,,,,, +,extra:abstract,,,,,,,Description,,,,citation_abstract, +,extra:pmcid,,,,,,,,,,,, +,extra:pmid,,,,,,,,,,,, +,extra:version,"eg, for software",,,,,,,,,,, +,,,,,,,,,,,,, +CREATOR,,,,,,,Agent,Creator / Contributor,,,,, +,display_name,NEW; usually western/latinized ,,.name.credit-name.value (?),,,,,,,,citation_author, +,orcid,,,.path,,,,,,,,, +,given_name,NEW,,.name.given-names.value,,,,,,,,, +,surname,NEW,,.name.family-names.value,,,,,,,,, +,extra:viaf,,,,,,,,,,,, +,extra:alt_names[],,,,,,,,,,,, +,,,,,,,,,,,,, +CONTAINER,,,,,,,N/A,relation.isPartOf,,,,, +,name,,,,,,,,,,,"citation_journal_title, citation_conference_title", +,publisher,,,,,,,,,,,, +,issnl,,,,,,,Identifier (?),,,,citation_issn (normalized), +,abbrev,,,,,,,,,,,citation_journal_abbrev, +,coden,,,,,,,,,,,, +,extra:issn-e,,,,,,,,,,,, +,extra:issn-p,,,,,,,,,,,, +FILE,,,,,,,Item,,,,,, +,release,,,,,,,,,,,, +,size,,,,,,,,,,length,, +,sha256,NEW,,,,,,,,,hash,, +,sha1,,,,,,,,,,hash,, +,md5,,,,,,,,,,hash,, +,url,,,,www_pdf_url,,,,,,href,citation_pdf_url, +,mimetype,NEW,,,,,,Format,,,type,, diff --git 
a/notes/schema/alignments.txt b/notes/schema/alignments.txt new file mode 100644 index 00000000..e2736268 --- /dev/null +++ b/notes/schema/alignments.txt @@ -0,0 +1,22 @@ +bibtex +BIBFRAME +schema.org: http://schema.org/CreativeWork +dublin core + +entity specific: +- crossref / release +- orcid / creator +- issn / container + + +## Dublin Core (original/simple) + +Usage guide: + +Quick descriptions of the "original 15" fields: + +## Citation Style Language + +Human-readable specification: + +Specifically, the "variables" and type definitions: diff --git a/notes/schema/contrib_types.txt b/notes/schema/contrib_types.txt new file mode 100644 index 00000000..01024b40 --- /dev/null +++ b/notes/schema/contrib_types.txt @@ -0,0 +1,14 @@ + +See also: + +author +editor +translator +illustrator +interviewer +reviewer + + +container-author +container-editor +reviewed-author diff --git a/notes/schema/work_release_types.txt b/notes/schema/work_release_types.txt new file mode 100644 index 00000000..6eff118b --- /dev/null +++ b/notes/schema/work_release_types.txt @@ -0,0 +1,28 @@ + +see also: + +basics: +- article-journal +- book +- chapter +- dataset +- manuscript +- paper-conference +- blog-post +- report +- thesis (aka, dissertation) + +- preprint +- conference proceeding +- book chapter +- technical report/memo +- dissertation/thesis +- blog post +- wiki page +- presentation (slides, recording) + +"publication status" instead of pre-print and manuscript types? + +abstracts as files? + +videos, slides diff --git a/notes/speed.txt b/notes/speed.txt deleted file mode 100644 index f885aea7..00000000 --- a/notes/speed.txt +++ /dev/null @@ -1,82 +0,0 @@ - -## Early Prototyping - -### 2018-04-23 - -- fatcat as marshmallow+sqlalchemy+flask, with API client -- no refs, contibs, files, release contribs, containers, etc -- no extra_json -- sqlite -- laptop -- editgroup every 250 edits - - - /data/crossref/crossref-works.2018-01-21.badsample_5k.json - - real 3m42.912s - user 0m20.448s - sys 0m2.852s - - ~22 lines per second - 12.5 hours per million - ~52 days for crossref (100 million) - -target: - crossref (100 million) loaded in 48 hours - 579 lines per second - this test in under 10 seconds - ... but could be in parallel - -same except postgres, via: - - docker run -p 5432:5432 postgres:latest - ./run.py --init-db --database-uri postgres://postgres@localhost:5432 - ./run.py --database-uri postgres://postgres@localhost:5432 - - API processing using 60-100% of a core. postgres 12% of a core; - docker-proxy similar (!). overall 70 of system CPU idle. - - real 2m27.771s - user 0m22.860s - sys 0m2.852s - -no profiling yet; need to look at database ops. probably don't even have any -indices! - -## Rust Updates (2018-05-23) - -Re-running with tweaked python code, 5k sample file, postgres 9.6 running locally (not in docker): - - real 2m27.598s - user 0m24.892s - sys 0m2.836s - -Using postgres and fatcat rust: - - real 0m44.443s - user 0m25.288s - sys 0m0.880s - -api_client about half a core; fatcatd 3x processes, about 10% each; postgres -very small. - -a bit faster, basically maxing out CPU: - - time cat /data/crossref/crossref-works.2018-01-21.badsample_5k.json | parallel -j4 --pipe ./fatcat_client.py --host-url http://localhost:9411 ic - - - real 0m28.998s - user 1m5.304s - sys 0m3.420s - - 200 lines per second; within a factor of 3; can perhaps hit target with - non-python client? - -python processes (clients) seem to be CPU limit in this case; all 4 cores -effectively maxed out. 
- -running python again in parallel mode: - - real 2m29.532s - user 0m47.692s - sys 0m4.840s - diff --git a/notes/work_release_types.txt b/notes/work_release_types.txt deleted file mode 100644 index 6eff118b..00000000 --- a/notes/work_release_types.txt +++ /dev/null @@ -1,28 +0,0 @@ - -see also: - -basics: -- article-journal -- book -- chapter -- dataset -- manuscript -- paper-conference -- blog-post -- report -- thesis (aka, dissertation) - -- preprint -- conference proceeding -- book chapter -- technical report/memo -- dissertation/thesis -- blog post -- wiki page -- presentation (slides, recording) - -"publication status" instead of pre-print and manuscript types? - -abstracts as files? - -videos, slides -- cgit v1.2.3