author     Bryan Newbold <bnewbold@robocracy.org>  2018-09-17 13:56:36 -0700
committer  Bryan Newbold <bnewbold@robocracy.org>  2018-09-17 13:56:36 -0700
commit     8baa2d8808797e423829fb040d70206d24b6156e
tree       62adfdd84118b0869fd9cc5a684d38d037a9f529
parent     5feeb70f930e733587ad7fa33d361cf24602abca
fix CHECK clauses
-rw-r--r--  notes/import_timing_20180910.txt               59
-rw-r--r--  notes/test_works.txt                            3
-rw-r--r--  python/README_import.md                         5
-rw-r--r--  rust/TODO                                       1
-rw-r--r--  rust/migrations/2018-05-12-001226_init/up.sql  14
-rw-r--r--  rust/src/database_schema.rs                    30
6 files changed, 90 insertions, 22 deletions
diff --git a/notes/import_timing_20180910.txt b/notes/import_timing_20180910.txt
index 47a556e8..c9f18548 100644
--- a/notes/import_timing_20180910.txt
+++ b/notes/import_timing_20180910.txt
@@ -257,3 +257,62 @@ Postgres really wanted to log: <https://wiki.postgresql.org/wiki/Per-user_log_se
ALTER DATABASE fatcat_prod SET log_statement = 'ddl';
ALTER USER fatcat SET log_statement = 'ddl';
+
+## Later Imports
+
+ zcat /srv/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched -
+ => only ~40 TPS
+ => looks like almost all selects... probably skipping? huge postgres CPU churn, huh. missing index?
+
+ relname | too_much_seq | case | rel_size | seq_scan | idx_scan
+ ----------------------+--------------+----------------+-------------+----------+----------
+ file_rev | -5343173 | OK | 867483648 | 693 | 5343866
+
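+The stats above look like output from the usual pg_stat_user_tables "missing
+index" check; roughly the following (reconstructed from memory, the size
+threshold is a guess, not the exact command used):
+
+    SELECT relname,
+           seq_scan - idx_scan AS too_much_seq,
+           CASE WHEN seq_scan - idx_scan > 0 THEN 'Missing Index?' ELSE 'OK' END AS "case",
+           pg_relation_size(relname::regclass) AS rel_size,
+           seq_scan,
+           idx_scan
+    FROM pg_stat_user_tables
+    WHERE pg_relation_size(relname::regclass) > 80000
+    ORDER BY too_much_seq DESC;
+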
+ EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '7d97e98f8af710c7e7fe703abc8f639e0ee507c4' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1;
+
+Ugh, this whole cycle again. Query only takes 1ms for a no-hit, or 5ms for success.
+
+ http get http://localhost:9411/v0/file/lookup?sha1=d77cb2a8b207507e0df27ba1393e8118eec9217f => not found
+ http get http://localhost:9411/v0/file/lookup?sha1=d6c3f6ca785ee63293da02615c8495c5cd25fa25 => found
+ => but no releases?
+
+SET auto_explain.log_min_duration = 0;
+SET auto_explain.log_analyze = true;
+ALTER USER fatcat SET log_statement = 'all';
+ALTER USER fatcat SET auto_explain.log_min_duration = 0;
+ALTER USER fatcat SET auto_explain.log_analyze = true;
+sudo service fatcat-api restart
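+
+(These auto_explain.* settings only do anything once the auto_explain module
+is loaded into the backend; it evidently was here, given the plan logged
+below. For reference, one server-wide way to load it:)
+
+    ALTER SYSTEM SET session_preload_libraries = 'auto_explain';
+    SELECT pg_reload_conf();  -- new sessions pick up the library and settings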
+
+
+2018-09-16 00:42:29.883 UTC [13868] fatcat@fatcat_prod LOG: duration: 3028.143 ms plan:
+ Query Text: SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = $1 AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = $2 AND "file_ident"."redirect_id" IS NULL LIMIT $3
+ Limit (cost=0.43..8.56 rows=1 width=454) (actual time=3028.139..3028.139 rows=0 loops=1)
+ -> Nested Loop (cost=0.43..268756.48 rows=33056 width=454) (actual time=3028.138..3028.138 rows=0 loops=1)
+ -> Seq Scan on file_rev (cost=0.00..205023.16 rows=33042 width=405) (actual time=3028.137..3028.137 rows=0 loops=1)
+ Filter: ((sha1 IS NOT NULL) AND ((sha1)::text = 'd68d4e2cc420647109cdfe410efed86cd64465e5'::text))
+ Rows Removed by Filter: 6607786
+ -> Index Scan using file_ident_rev_idx on file_ident (cost=0.43..1.92 rows=1 width=49) (never executed)
+ Index Cond: (rev_id = file_rev.id)
+ Filter: (is_live AND (redirect_id IS NULL))
+
+
+fatcat_prod=# EXPLAIN SELECT "file_ident"."id", "file_ident"."is_live", "file_ident"."rev_id", "file_ident"."redirect_id", "file_rev"."id", "file_rev"."extra_json", "file_rev"."size", "file_rev"."sha1", "file_rev"."sha256", "file_rev"."md5", "file_rev"."mimetype" FROM ("file_ident" INNER JOIN "file_rev" ON "file_ident"."rev_id" = "file_rev"."id") WHERE "file_rev"."sha1" = '3f242a192acc258bdfdb15194341943222222222222440c313' AND "file_rev"."sha1" IS NOT NULL AND "file_ident"."is_live" = 't' AND "file_ident"."redirect_id" IS NULL LIMIT 1;
+ QUERY PLAN
+--------------------------------------------------------------------------------------------------------------------------
+ Limit (cost=0.99..5.04 rows=1 width=454)
+ -> Nested Loop (cost=0.99..5.04 rows=1 width=454)
+ -> Index Scan using file_rev_sha1_idx on file_rev (cost=0.56..2.58 rows=1 width=405)
+ Index Cond: ((sha1 IS NOT NULL) AND (sha1 = '3f242a192acc258bdfdb15194341943222222222222440c313'::bpchar))
+ -> Index Scan using file_ident_rev_idx on file_ident (cost=0.43..2.45 rows=1 width=49)
+ Index Cond: (rev_id = file_rev.id)
+ Filter: (is_live AND (redirect_id IS NULL))
+
+After all this debugging, asked on IRC and learned that I really shouldn't
+ever use CHAR() (or even VARCHAR()) in postgres. The parameterized query ends
+up comparing the bpchar column as text (the slow plan above filters on
+(sha1)::text), so the sha1 index never gets used; the hand-typed EXPLAIN casts
+the literal to bpchar instead and hits the index. Will update the schema to
+plain TEXT.
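+
+For the existing production database that presumably means an in-place column
+type change along these lines (hypothetical sketch, not part of this commit):
+
+    -- rewrites the table and rebuilds indexes; needs a maintenance window
+    ALTER TABLE file_rev
+        ALTER COLUMN sha1 TYPE TEXT,
+        ALTER COLUMN sha256 TYPE TEXT,
+        ALTER COLUMN md5 TYPE TEXT;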
+
+## Investigating Missing Rows
+
+ fatcat_prod=# select count(*) from release_ident; => 53028167
+
+Thought I had more than that previously? Table sizes were also bigger. Hrm.
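+
+One way to double-check table sizes on disk (a sketch using the standard
+catalog views):
+
+    SELECT relname, pg_size_pretty(pg_total_relation_size(relid)) AS total_size
+    FROM pg_statio_user_tables
+    ORDER BY pg_total_relation_size(relid) DESC;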
+
diff --git a/notes/test_works.txt b/notes/test_works.txt
index 286b4d3a..d0663e76 100644
--- a/notes/test_works.txt
+++ b/notes/test_works.txt
@@ -33,3 +33,6 @@ Fulltext via CORE publisher-connector:
10.1186/s12889-016-2706-9
+## Missing
+
+"ACE: A Novel Software Platform to Ensure the Integrity [...]"
diff --git a/python/README_import.md b/python/README_import.md
index 38c8406f..d481b7ae 100644
--- a/python/README_import.md
+++ b/python/README_import.md
@@ -112,3 +112,8 @@ From compressed:
sys 19m3.576s
Really sped up once not contending with Crossref import, so don't run these two at the same time.
+
+## Matched
+
+ zcat /srv/datasets/2018-08-27-2352.17-matchcrossref.insertable.json.gz | pv -l | time parallel -j12 --round-robin --pipe ./fatcat_import.py import-matched -
+
diff --git a/rust/TODO b/rust/TODO
index d424c76c..c8a1a1bd 100644
--- a/rust/TODO
+++ b/rust/TODO
@@ -5,6 +5,7 @@ correctness
- enforce "no editing if editgroup accepted" behavior
- changelog sequence without gaps
- batch insert editgroup behavior; always a new editgroup?
+- redirect rev_id needs to be updated when primary changes
edit lifecycle
- editgroup: state to track review status?
diff --git a/rust/migrations/2018-05-12-001226_init/up.sql b/rust/migrations/2018-05-12-001226_init/up.sql
index a6384f4e..d94e9b71 100644
--- a/rust/migrations/2018-05-12-001226_init/up.sql
+++ b/rust/migrations/2018-05-12-001226_init/up.sql
@@ -47,7 +47,7 @@ CREATE INDEX changelog_editgroup_idx ON changelog(editgroup_id);
CREATE TABLE abstracts (
-- fixed size hash (in hex). TODO: switch to bytes
- sha1 TEXT PRIMARY KEY CHECK (octet_length(sha1) == 40),
+ sha1 TEXT PRIMARY KEY CHECK (octet_length(sha1) = 40),
content TEXT NOT NULL
);
@@ -60,7 +60,7 @@ CREATE TABLE creator_rev (
given_name TEXT,
surname TEXT,
-- fixed size identifier
- orcid TEXT CHECK(octet_length(orcid) == 19),
+ orcid TEXT CHECK(octet_length(orcid) = 19),
-- limited size for data quality
wikidata_qid TEXT CHECK(octet_length(wikidata_qid) <= 12)
@@ -103,7 +103,7 @@ CREATE TABLE container_rev (
name TEXT NOT NULL,
publisher TEXT,
-- fixed size identifier
- issnl TEXT CHECK(octet_length(issnl) == 9),
+ issnl TEXT CHECK(octet_length(issnl) = 9),
-- limited size for data quality
wikidata_qid TEXT CHECK(octet_length(wikidata_qid) <= 12),
abbrev TEXT,
@@ -142,9 +142,9 @@ CREATE TABLE file_rev (
size BIGINT,
     -- fixed size hashes (in hex). TODO: switch to binary type
- sha1 TEXT CHECK(octet_length(sha1) == 40),
- sha256 TEXT CHECK(octet_length(sha256) == 64),
- md5 TEXT CHECK(octet_length(md5) == 32),
+ sha1 TEXT CHECK(octet_length(sha1) = 40),
+ sha256 TEXT CHECK(octet_length(sha256) = 64),
+ md5 TEXT CHECK(octet_length(md5) = 32),
mimetype TEXT
);
@@ -198,7 +198,7 @@ CREATE TABLE release_rev (
pmid TEXT CHECK(octet_length(pmid) <= 12),
pmcid TEXT CHECK(octet_length(pmcid) <= 12),
wikidata_qid TEXT CHECK(octet_length(wikidata_qid) <= 12),
- isbn13 TEXT CHECK(octet_length(isbn13) == 13),
+ isbn13 TEXT CHECK(octet_length(isbn13) = 13),
core_id TEXT CHECK(octet_length(core_id) <= 12),
volume TEXT,
issue TEXT,
diff --git a/rust/src/database_schema.rs b/rust/src/database_schema.rs
index bc3f6c3a..d7bba7dc 100644
--- a/rust/src/database_schema.rs
+++ b/rust/src/database_schema.rs
@@ -1,6 +1,6 @@
table! {
abstracts (sha1) {
- sha1 -> Bpchar,
+ sha1 -> Varchar,
content -> Text,
}
}
@@ -41,10 +41,10 @@ table! {
extra_json -> Nullable<Jsonb>,
name -> Text,
publisher -> Nullable<Text>,
- issnl -> Nullable<Bpchar>,
- wikidata_qid -> Nullable<Text>,
+ issnl -> Nullable<Varchar>,
+ wikidata_qid -> Nullable<Varchar>,
abbrev -> Nullable<Text>,
- coden -> Nullable<Text>,
+ coden -> Nullable<Varchar>,
}
}
@@ -77,8 +77,8 @@ table! {
display_name -> Text,
given_name -> Nullable<Text>,
surname -> Nullable<Text>,
- orcid -> Nullable<Bpchar>,
- wikidata_qid -> Nullable<Text>,
+ orcid -> Nullable<Varchar>,
+ wikidata_qid -> Nullable<Varchar>,
}
}
@@ -136,9 +136,9 @@ table! {
id -> Uuid,
extra_json -> Nullable<Jsonb>,
size -> Nullable<Int8>,
- sha1 -> Nullable<Bpchar>,
- sha256 -> Nullable<Bpchar>,
- md5 -> Nullable<Bpchar>,
+ sha1 -> Nullable<Varchar>,
+ sha256 -> Nullable<Varchar>,
+ md5 -> Nullable<Varchar>,
mimetype -> Nullable<Text>,
}
}
@@ -212,11 +212,11 @@ table! {
release_status -> Nullable<Text>,
release_date -> Nullable<Date>,
doi -> Nullable<Text>,
- pmid -> Nullable<Text>,
- pmcid -> Nullable<Text>,
- wikidata_qid -> Nullable<Text>,
- isbn13 -> Nullable<Text>,
- core_id -> Nullable<Text>,
+ pmid -> Nullable<Varchar>,
+ pmcid -> Nullable<Varchar>,
+ wikidata_qid -> Nullable<Varchar>,
+ isbn13 -> Nullable<Varchar>,
+ core_id -> Nullable<Varchar>,
volume -> Nullable<Text>,
issue -> Nullable<Text>,
pages -> Nullable<Text>,
@@ -229,7 +229,7 @@ table! {
release_rev_abstract (id) {
id -> Int8,
release_rev -> Uuid,
- abstract_sha1 -> Bpchar,
+ abstract_sha1 -> Varchar,
mimetype -> Nullable<Text>,
lang -> Nullable<Text>,
}