diff options
-rw-r--r-- | notes/import_timing_20180910.txt | 59 | ||||
-rw-r--r-- | notes/test_works.txt | 3 | ||||
-rw-r--r-- | python/README_import.md | 5 | ||||
-rw-r--r-- | rust/TODO | 1 | ||||
-rw-r--r-- | rust/migrations/2018-05-12-001226_init/up.sql | 14 | ||||
-rw-r--r-- | rust/src/database_schema.rs | 30 |
6 files changed, 90 insertions, 22 deletions
diff --git a/notes/import_timing_20180910.txt b/notes/import_timing_20180910.txt index 47a556e8..c9f18548 100644 --- a/notes/import_timing_20180910.txt +++ b/notes/import_timing_20180910.txt @@ -257,3 +257,62 @@ Postgres really wanted to log: <https://wiki.postgresql.org/wiki/Per-user_log_se ALTER DATABASE fatcat_prod SET log_statement = 'ddl'; ALTER USER fatcat SET log_statement = 'ddl'; + +## Later Imports + + zcat /srv/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - + => only ~40 TPS + => looks like almost all selects... probably skipping? huge postgres CPU churn, huh. missing index? + + relname | too_much_seq | case | rel_size | seq_scan | idx_scan + ----------------------+--------------+----------------+-------------+----------+---------- + file_rev | -5343173 | OK | 867483648 | 693 | 5343866 + + EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '7d97e98f8af710c7e7fe703abc8f639e0ee507c4' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1; + +Ugh, this whole cycle again. Query only takes 1ms for a no-hit, or 5ms for success. + + http get http://localhost:9411/v0/file/lookup?sha1=d77cb2a8b207507e0df27ba1393e8118eec9217f => not found + http get http://localhost:9411/v0/file/lookup?sha1=d6c3f6ca785ee63293da02615c8495c5cd25fa25 => found + => but no releases? + +SET auto_explain.log_min_duration = 0; +SET auto_explain.log_analyze = true; +ALTER USER fatcat SET log_statement = 'all'; +ALTER USER fatcat SET auto_explain.log_min_duration = 0; +ALTER USER fatcat SET auto_explain.log_analyze = true;; +sudo service fatcat-api restart + + +2018-09-16 00:42:29.883 UTC [13868] fatcat@fatcat_prod LOG: duration: 3028.143 ms plan: + Query Text: SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = $1 AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = $2 AND "file_ident"."redirect_id" IS NULL LIMIT $3 + Limit (cost=0.43..8.56 rows=1 width=454) (actual time=3028.139..3028.139 rows=0 loops=1) + -> Nested Loop (cost=0.43..268756.48 rows=33056 width=454) (actual time=3028.138..3028.138 rows=0 loops=1) + -> Seq Scan on file_rev (cost=0.00..205023.16 rows=33042 width=405) (actual time=3028.137..3028.137 rows=0 loops=1) + Filter: ((sha1 IS NOT NULL) AND ((sha1)::text = 'd68d4e2cc420647109cdfe410efed86cd64465e5'::text)) + Rows Removed by Filter: 6607786 + -> Index Scan using file_ident_rev_idx on file_ident (cost=0.43..1.92 rows=1 width=49) (never executed) + Index Cond: (rev_id = file_rev.id) + Filter: (is_live AND (redirect_id IS NULL)) + + +fatcat_prod=# EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '3f242a192acc258bdfdb15194341943222222222222440c313' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1; + QUERY PLAN +-------------------------------------------------------------------------------------------------------------------------- + Limit (cost=0.99..5.04 rows=1 width=454) + -> Nested Loop (cost=0.99..5.04 rows=1 width=454) + -> Index Scan using file_rev_sha1_idx on file_rev (cost=0.56..2.58 rows=1 width=405) + Index Cond: ((sha1 IS NOT NULL) AND (sha1 = '3f242a192acc258bdfdb15194341943222222222222440c313'::bpchar)) + -> Index Scan using file_ident_rev_idx on file_ident (cost=0.43..2.45 rows=1 width=49) + Index Cond: (rev_id = file_rev.id) + Filter: (is_live AND (redirect_id IS NULL)) + +After all this debugging, asked on IRC and learned that I really shouldn't ever +use CHAR() (or even VARCHAR()). Will update the schema. + +## Investigating Missing Rows + + fatcat_prod=# select count(*) from release_ident; => 53028167 + +Thought I had more than that previously? Table sizes were also bigger. Hrm. + diff --git a/notes/test_works.txt b/notes/test_works.txt index 286b4d3a..d0663e76 100644 --- a/notes/test_works.txt +++ b/notes/test_works.txt @@ -33,3 +33,6 @@ Fulltext via CORE publisher-connector: 10.1186/s12889-016-2706-9 +## Missing + +"ACE: A Novel Software Platform to Ensure the Integrity [...]" diff --git a/python/README_import.md b/python/README_import.md index 38c8406f..d481b7ae 100644 --- a/python/README_import.md +++ b/python/README_import.md @@ -112,3 +112,8 @@ From compressed: sys 19m3.576s Really sped up once not contending with Crossref import, so don't run these two at the same time. + +## Matched + + zcat /srv/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched - + @@ -5,6 +5,7 @@ correctness - enforce "no editing if editgroup accepted" behavior - changelog sequence without gaps - batch insert editgroup behavior; always a new editgroup? +- redirect rev_id needs to be updated when primary changes edit lifecycle - editgroup: state to track review status? diff --git a/rust/migrations/2018-05-12-001226_init/up.sql b/rust/migrations/2018-05-12-001226_init/up.sql index a6384f4e..d94e9b71 100644 --- a/rust/migrations/2018-05-12-001226_init/up.sql +++ b/rust/migrations/2018-05-12-001226_init/up.sql @@ -47,7 +47,7 @@ CREATE INDEX changelog_editgroup_idx ON changelog(editgroup_id); CREATE TABLE abstracts ( -- fixed size hash (in hex). TODO: switch to bytes - sha1 TEXT PRIMARY KEY CHECK (octet_length(sha1) == 40), + sha1 TEXT PRIMARY KEY CHECK (octet_length(sha1) = 40), content TEXT NOT NULL ); @@ -60,7 +60,7 @@ CREATE TABLE creator_rev ( given_name TEXT, surname TEXT, -- fixed size identifier - orcid TEXT CHECK(octet_length(orcid) == 19), + orcid TEXT CHECK(octet_length(orcid) = 19), -- limited size for data quality wikidata_qid TEXT CHECK(octet_length(wikidata_qid) <= 12) @@ -103,7 +103,7 @@ CREATE TABLE container_rev ( name TEXT NOT NULL, publisher TEXT, -- fixed size identifier - issnl TEXT CHECK(octet_length(issnl) == 9), + issnl TEXT CHECK(octet_length(issnl) = 9), -- limited size for data quality wikidata_qid TEXT CHECK(octet_length(wikidata_qid) <= 12), abbrev TEXT, @@ -142,9 +142,9 @@ CREATE TABLE file_rev ( size BIGINT, -- fixed size hashes (in hex). TODO: switch to binary type type - sha1 TEXT CHECK(octet_length(sha1) == 40), - sha256 TEXT CHECK(octet_length(sha256) == 64), - md5 TEXT CHECK(octet_length(md5) == 32), + sha1 TEXT CHECK(octet_length(sha1) = 40), + sha256 TEXT CHECK(octet_length(sha256) = 64), + md5 TEXT CHECK(octet_length(md5) = 32), mimetype TEXT ); @@ -198,7 +198,7 @@ CREATE TABLE release_rev ( pmid TEXT CHECK(octet_length(pmid) <= 12), pmcid TEXT CHECK(octet_length(pmcid) <= 12), wikidata_qid TEXT CHECK(octet_length(wikidata_qid) <= 12), - isbn13 TEXT CHECK(octet_length(isbn13) == 13), + isbn13 TEXT CHECK(octet_length(isbn13) = 13), core_id TEXT CHECK(octet_length(core_id) <= 12), volume TEXT, issue TEXT, diff --git a/rust/src/database_schema.rs b/rust/src/database_schema.rs index bc3f6c3a..d7bba7dc 100644 --- a/rust/src/database_schema.rs +++ b/rust/src/database_schema.rs @@ -1,6 +1,6 @@ table! { abstracts (sha1) { - sha1 -> Bpchar, + sha1 -> Varchar, content -> Text, } } @@ -41,10 +41,10 @@ table! { extra_json -> Nullable<Jsonb>, name -> Text, publisher -> Nullable<Text>, - issnl -> Nullable<Bpchar>, - wikidata_qid -> Nullable<Text>, + issnl -> Nullable<Varchar>, + wikidata_qid -> Nullable<Varchar>, abbrev -> Nullable<Text>, - coden -> Nullable<Text>, + coden -> Nullable<Varchar>, } } @@ -77,8 +77,8 @@ table! { display_name -> Text, given_name -> Nullable<Text>, surname -> Nullable<Text>, - orcid -> Nullable<Bpchar>, - wikidata_qid -> Nullable<Text>, + orcid -> Nullable<Varchar>, + wikidata_qid -> Nullable<Varchar>, } } @@ -136,9 +136,9 @@ table! { id -> Uuid, extra_json -> Nullable<Jsonb>, size -> Nullable<Int8>, - sha1 -> Nullable<Bpchar>, - sha256 -> Nullable<Bpchar>, - md5 -> Nullable<Bpchar>, + sha1 -> Nullable<Varchar>, + sha256 -> Nullable<Varchar>, + md5 -> Nullable<Varchar>, mimetype -> Nullable<Text>, } } @@ -212,11 +212,11 @@ table! { release_status -> Nullable<Text>, release_date -> Nullable<Date>, doi -> Nullable<Text>, - pmid -> Nullable<Text>, - pmcid -> Nullable<Text>, - wikidata_qid -> Nullable<Text>, - isbn13 -> Nullable<Text>, - core_id -> Nullable<Text>, + pmid -> Nullable<Varchar>, + pmcid -> Nullable<Varchar>, + wikidata_qid -> Nullable<Varchar>, + isbn13 -> Nullable<Varchar>, + core_id -> Nullable<Varchar>, volume -> Nullable<Text>, issue -> Nullable<Text>, pages -> Nullable<Text>, @@ -229,7 +229,7 @@ table! { release_rev_abstract (id) { id -> Int8, release_rev -> Uuid, - abstract_sha1 -> Bpchar, + abstract_sha1 -> Varchar, mimetype -> Nullable<Text>, lang -> Nullable<Text>, } |