From 9ca9ed84f0a404c3740a7ac33978bd9d6c6a0c4a Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Fri, 11 Jan 2019 19:43:07 -0800 Subject: first draft implementation of ref interning --- rust/src/database_models.rs | 29 +++++------ rust/src/database_schema.rs | 23 +++++---- rust/src/entity_crud.rs | 114 +++++++++++++++++++++++++++++--------------- 3 files changed, 102 insertions(+), 64 deletions(-) (limited to 'rust/src') diff --git a/rust/src/database_models.rs b/rust/src/database_models.rs index 63fbcb29..47fd062d 100644 --- a/rust/src/database_models.rs +++ b/rust/src/database_models.rs @@ -376,6 +376,7 @@ pub struct ReleaseRevRow { pub extra_json: Option, pub work_ident_id: Uuid, pub container_ident_id: Option, + pub refs_blob_sha1: Option, pub title: String, pub release_type: Option, pub release_status: Option, @@ -400,6 +401,7 @@ pub struct ReleaseRevNewRow { pub extra_json: Option, pub work_ident_id: Uuid, pub container_ident_id: Option, + pub refs_blob_sha1: Option, pub title: String, pub release_type: Option, pub release_status: Option, @@ -491,33 +493,28 @@ pub struct ReleaseContribNewRow { pub extra_json: Option, } -#[derive(Debug, Queryable, Identifiable, Associations, AsChangeset)] +#[derive(Debug, Queryable, Insertable, Associations, AsChangeset)] #[table_name = "release_ref"] pub struct ReleaseRefRow { - pub id: i64, pub release_rev: Uuid, - pub target_release_ident_id: Option, - pub index_val: Option, - pub key: Option, - pub extra_json: Option, - pub container_name: Option, - pub year: Option, - pub title: Option, - pub locator: Option, + pub index_val: i32, + pub target_release_ident_id: Uuid, } -#[derive(Debug, Insertable, AsChangeset)] -#[table_name = "release_ref"] -pub struct ReleaseRefNewRow { - pub release_rev: Uuid, - pub target_release_ident_id: Option, - pub index_val: Option, +/* These fields now interned in JSON blob pub key: Option, pub extra_json: Option, pub container_name: Option, pub year: Option, pub title: Option, pub locator: Option, +*/ + +#[derive(Debug, Queryable, Insertable, Associations, AsChangeset)] +#[table_name = "refs_blob"] +pub struct RefsBlobRow { + pub sha1: String, + pub refs_json: serde_json::Value, } #[derive(Debug, Queryable, Insertable, Associations, AsChangeset)] diff --git a/rust/src/database_schema.rs b/rust/src/database_schema.rs index 3bc57d95..0a067a10 100644 --- a/rust/src/database_schema.rs +++ b/rust/src/database_schema.rs @@ -238,6 +238,13 @@ table! { } } +table! { + refs_blob (sha1) { + sha1 -> Text, + refs_json -> Jsonb, + } +} + table! { release_contrib (id) { id -> Int8, @@ -273,17 +280,10 @@ table! { } table! { - release_ref (id) { - id -> Int8, + release_ref (release_rev, index_val) { release_rev -> Uuid, - target_release_ident_id -> Nullable, - index_val -> Nullable, - key -> Nullable, - extra_json -> Nullable, - container_name -> Nullable, - year -> Nullable, - title -> Nullable, - locator -> Nullable, + index_val -> Int4, + target_release_ident_id -> Uuid, } } @@ -293,6 +293,7 @@ table! { extra_json -> Nullable, work_ident_id -> Uuid, container_ident_id -> Nullable, + refs_blob_sha1 -> Nullable, title -> Text, release_type -> Nullable, release_status -> Nullable, @@ -439,6 +440,7 @@ joinable!(release_ident -> release_rev (rev_id)); joinable!(release_ref -> release_ident (target_release_ident_id)); joinable!(release_ref -> release_rev (release_rev)); joinable!(release_rev -> container_ident (container_ident_id)); +joinable!(release_rev -> refs_blob (refs_blob_sha1)); joinable!(release_rev -> work_ident (work_ident_id)); joinable!(release_rev_abstract -> abstracts (abstract_sha1)); joinable!(release_rev_abstract -> release_rev (release_rev)); @@ -475,6 +477,7 @@ allow_tables_to_appear_in_same_query!( fileset_rev_file, fileset_rev_release, fileset_rev_url, + refs_blob, release_contrib, release_edit, release_ident, diff --git a/rust/src/entity_crud.rs b/rust/src/entity_crud.rs index ce1c1ed7..6f0f77aa 100644 --- a/rust/src/entity_crud.rs +++ b/rust/src/entity_crud.rs @@ -1812,28 +1812,26 @@ impl EntityCrud for ReleaseEntity { None => (None, None, None), }; - let refs: Option> = match hide.refs { - true => None, - false => Some( - release_ref::table + let refs: Option> = match (hide.refs, rev_row.refs_blob_sha1) { + (true, _) => None, + (false, None) => None, + (false, Some(sha1)) => Some({ + let refs_blob: RefsBlobRow = refs_blob::table + .find(sha1) // checked in match + .get_result(conn)?; + let mut refs: Vec = serde_json::from_value(refs_blob.refs_json)?; + let ref_rows: Vec = release_ref::table .filter(release_ref::release_rev.eq(rev_row.id)) .order(release_ref::index_val.asc()) - .get_results(conn)? - .into_iter() - .map(|r: ReleaseRefRow| ReleaseRef { - index: r.index_val.map(|v| v as i64), - key: r.key, - extra: r.extra_json, - container_name: r.container_name, - year: r.year.map(|v| v as i64), - title: r.title, - locator: r.locator, - target_release_id: r - .target_release_ident_id - .map(|v| FatcatId::from_uuid(&v).to_string()), - }) - .collect(), - ), + .get_results(conn)?; + for index in 0..refs.len() { + refs[index].index = Some(index as i64) + } + for row in ref_rows { + refs[row.index_val as usize].target_release_id = Some(FatcatId::from_uuid(&row.target_release_ident_id).to_string()); + } + refs + }), }; let contribs: Option> = match hide.contribs { @@ -1953,12 +1951,56 @@ impl EntityCrud for ReleaseEntity { .into()); } + // First, calculate and upsert any refs JSON blobs and record the SHA1 keys, so they can be + // included in the release_rev row itself + let mut refs_blob_rows: Vec = vec![]; + let mut refs_blob_sha1: Vec> = vec![]; + for model in models.iter() { + match &model.refs { + None => { + refs_blob_sha1.push(None); + }, + Some(ref_list) => { + // Have to strip out target refs and indexes, or hashing won't work well when + // these change + let ref_list: Vec = ref_list + .iter() + .map(|r| { + let mut r = r.clone(); + r.target_release_id = None; + r.index = None; + r + }) + .collect(); + // TODO: maybe `canonical_json` crate? + let refs_json = serde_json::to_value(ref_list)?; + let refs_str = refs_json.to_string(); + let sha1 = Sha1::from(refs_str).hexdigest(); + let blob = RefsBlobRow { sha1: sha1.clone(), refs_json }; + refs_blob_rows.push(blob); + refs_blob_sha1.push(Some(sha1)); + } + }; + } + + if !refs_blob_rows.is_empty() { + // Sort of an "upsert"; only inserts new abstract rows if they don't already exist + insert_into(refs_blob::table) + .values(&refs_blob_rows) + .on_conflict(refs_blob::sha1) + .do_nothing() + .execute(conn)?; + } + + // Then the main release_revs themselves let rev_ids: Vec = insert_into(release_rev::table) .values( models .iter() - .map(|model| { + .zip(refs_blob_sha1.into_iter()) + .map(|(model, refs_sha1)| { Ok(ReleaseRevNewRow { + refs_blob_sha1: refs_sha1, title: model.title.clone().unwrap(), // titles checked above release_type: model.release_type.clone(), release_status: model.release_status.clone(), @@ -1991,34 +2033,30 @@ impl EntityCrud for ReleaseEntity { .returning(release_rev::id) .get_results(conn)?; - let mut release_ref_rows: Vec = vec![]; + let mut release_ref_rows: Vec = vec![]; let mut release_contrib_rows: Vec = vec![]; let mut abstract_rows: Vec = vec![]; let mut release_abstract_rows: Vec = vec![]; for (model, rev_id) in models.iter().zip(rev_ids.iter()) { + + // We didn't know the release_rev id to insert here, so need to re-iterate over refs match &model.refs { None => (), Some(ref_list) => { - let these_ref_rows: Vec = ref_list + let these_ref_rows: Vec = ref_list .iter() - .map(|r| { - Ok(ReleaseRefNewRow { + .enumerate() + .filter(|(_, r)| r.target_release_id.is_some()) + .map(|(index, r)| { + Ok(ReleaseRefRow { release_rev: *rev_id, - target_release_ident_id: match r.target_release_id.clone() { - None => None, - Some(v) => Some(FatcatId::from_str(&v)?.to_uuid()), - }, - index_val: r.index.map(|v| v as i32), - key: r.key.clone(), - container_name: r.container_name.clone(), - year: r.year.map(|v| v as i32), - title: r.title.clone(), - locator: r.locator.clone(), - extra_json: r.extra.clone(), + // unwrap() checked by is_some() filter + target_release_ident_id: FatcatId::from_str(&r.target_release_id.clone().unwrap())?.to_uuid(), + index_val: index as i32, }) }) - .collect::>>()?; + .collect::>>()?; release_ref_rows.extend(these_ref_rows); } }; @@ -2053,7 +2091,7 @@ impl EntityCrud for ReleaseEntity { .iter() .filter(|ea| ea.content.is_some()) .map(|c| AbstractsRow { - sha1: Sha1::from(c.content.clone().unwrap()).hexdigest(), + sha1: Sha1::from(c.content.as_ref().unwrap()).hexdigest(), content: c.content.clone().unwrap(), }) .collect(); -- cgit v1.2.3 From b73deebe003306093913a8c62f2128917e181654 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 14 Jan 2019 18:43:46 -0800 Subject: correct release refs return value when empty --- rust/src/entity_crud.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'rust/src') diff --git a/rust/src/entity_crud.rs b/rust/src/entity_crud.rs index 6f0f77aa..1147e117 100644 --- a/rust/src/entity_crud.rs +++ b/rust/src/entity_crud.rs @@ -1814,7 +1814,7 @@ impl EntityCrud for ReleaseEntity { let refs: Option> = match (hide.refs, rev_row.refs_blob_sha1) { (true, _) => None, - (false, None) => None, + (false, None) => Some(vec![]), (false, Some(sha1)) => Some({ let refs_blob: RefsBlobRow = refs_blob::table .find(sha1) // checked in match @@ -1961,6 +1961,10 @@ impl EntityCrud for ReleaseEntity { refs_blob_sha1.push(None); }, Some(ref_list) => { + if ref_list.is_empty() { + refs_blob_sha1.push(None); + continue + } // Have to strip out target refs and indexes, or hashing won't work well when // these change let ref_list: Vec = ref_list -- cgit v1.2.3 From 1888cea5544682a4d9084b1c1df36e363b32b861 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 14 Jan 2019 19:03:27 -0800 Subject: add serde deps directly to fatcat crate --- rust/Cargo.lock | 3 +++ rust/Cargo.toml | 3 +++ rust/src/lib.rs | 2 ++ 3 files changed, 8 insertions(+) (limited to 'rust/src') diff --git a/rust/Cargo.lock b/rust/Cargo.lock index e89954ad..c0df5a2a 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -538,6 +538,9 @@ dependencies = [ "rand 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "sentry 0.12.1 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.84 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.84 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_ignored 0.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.34 (registry+https://github.com/rust-lang/crates.io-index)", "sha1 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", "slog 2.4.1 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 155e3c8a..c5a52845 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -37,6 +37,9 @@ slog = "^2.0" slog-term = "*" slog-async = "*" serde_json = "1.0" +serde = "*" +serde_derive = "1.0" +serde_ignored = "0.0.4" sentry = { version = "^0.12", default-features = false, features = ["with_client_implementation", "with_backtrace", "with_panic", "with_log", "with_rust_info", "with_failure"] } cadence = "^0.16" diff --git a/rust/src/lib.rs b/rust/src/lib.rs index b7661334..d089adf8 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -9,6 +9,8 @@ extern crate log; extern crate lazy_static; #[macro_use] extern crate failure; +#[macro_use] +extern crate serde_derive; pub mod auth; pub mod database_models; -- cgit v1.2.3 From 70bc687e3fec738dbbfb632cb2be22cd5802b891 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 14 Jan 2019 19:03:57 -0800 Subject: add interstitial JSON serde model to citation internment --- rust/src/database_models.rs | 92 ++++++++++++++++++++++++++++++++++++++++----- rust/src/entity_crud.rs | 9 +++-- 2 files changed, 87 insertions(+), 14 deletions(-) (limited to 'rust/src') diff --git a/rust/src/database_models.rs b/rust/src/database_models.rs index 47fd062d..b76b469a 100644 --- a/rust/src/database_models.rs +++ b/rust/src/database_models.rs @@ -4,7 +4,7 @@ use crate::database_schema::*; use crate::errors::*; use crate::identifiers::uuid2fcid; use chrono; -use fatcat_api_spec::models::{ChangelogEntry, Editgroup, EditgroupAnnotation, Editor, EntityEdit}; +use fatcat_api_spec::models::{ChangelogEntry, Editgroup, EditgroupAnnotation, Editor, EntityEdit, ReleaseRef}; use serde_json; use uuid::Uuid; @@ -501,15 +501,6 @@ pub struct ReleaseRefRow { pub target_release_ident_id: Uuid, } -/* These fields now interned in JSON blob - pub key: Option, - pub extra_json: Option, - pub container_name: Option, - pub year: Option, - pub title: Option, - pub locator: Option, -*/ - #[derive(Debug, Queryable, Insertable, Associations, AsChangeset)] #[table_name = "refs_blob"] pub struct RefsBlobRow { @@ -517,6 +508,87 @@ pub struct RefsBlobRow { pub refs_json: serde_json::Value, } +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +/// This model is a stable representation of what goes in a RefsBlobRow `refs_json` field (an array +/// of this model). We could rely on the `ReleaseRef` API spec model directly, but that would lock +/// the database contents to the API spec rigidly; by defining this struct independently, we can +/// migrate the schemas. To start, this is a direct copy of the `ReleaseRef` model. +pub struct RefsBlobJson { + #[serde(rename = "index")] + #[serde(skip_serializing_if = "Option::is_none")] + pub index: Option, + + /// base32-encoded unique identifier + #[serde(rename = "target_release_id")] + #[serde(skip_serializing_if = "Option::is_none")] + pub target_release_id: Option, + + #[serde(rename = "extra")] + #[serde(skip_serializing_if = "Option::is_none")] + pub extra: Option, + + #[serde(rename = "key")] + #[serde(skip_serializing_if = "Option::is_none")] + pub key: Option, + + #[serde(rename = "year")] + #[serde(skip_serializing_if = "Option::is_none")] + pub year: Option, + + #[serde(rename = "container_name")] + #[serde(skip_serializing_if = "Option::is_none")] + pub container_name: Option, + + #[serde(rename = "title")] + #[serde(skip_serializing_if = "Option::is_none")] + pub title: Option, + + #[serde(rename = "locator")] + #[serde(skip_serializing_if = "Option::is_none")] + pub locator: Option, +} + +impl RefsBlobJson { + pub fn into_model(self) -> ReleaseRef { + ReleaseRef { + index: self.index, + target_release_id: self.target_release_id, + extra: self.extra, + key: self.key, + year: self.year, + container_name: self.container_name, + title: self.title, + locator: self.locator, + } + } + + pub fn to_model(&self) -> ReleaseRef { + ReleaseRef { + index: self.index, + target_release_id: self.target_release_id.clone(), + extra: self.extra.clone(), + key: self.key.clone(), + year: self.year, + container_name: self.container_name.clone(), + title: self.title.clone(), + locator: self.locator.clone(), + } + } + + pub fn from_model(model: &ReleaseRef) -> RefsBlobJson { + RefsBlobJson { + index: model.index, + target_release_id: model.target_release_id.clone(), + extra: model.extra.clone(), + key: model.key.clone(), + year: model.year, + container_name: model.container_name.clone(), + title: model.title.clone(), + locator: model.locator.clone(), + } + } +} + #[derive(Debug, Queryable, Insertable, Associations, AsChangeset)] #[table_name = "file_rev_release"] pub struct FileRevReleaseRow { diff --git a/rust/src/entity_crud.rs b/rust/src/entity_crud.rs index 1147e117..09ce9542 100644 --- a/rust/src/entity_crud.rs +++ b/rust/src/entity_crud.rs @@ -1819,7 +1819,8 @@ impl EntityCrud for ReleaseEntity { let refs_blob: RefsBlobRow = refs_blob::table .find(sha1) // checked in match .get_result(conn)?; - let mut refs: Vec = serde_json::from_value(refs_blob.refs_json)?; + let refs: Vec = serde_json::from_value(refs_blob.refs_json)?; + let mut refs: Vec = refs.into_iter().map(|j| j.into_model()).collect(); let ref_rows: Vec = release_ref::table .filter(release_ref::release_rev.eq(rev_row.id)) .order(release_ref::index_val.asc()) @@ -1967,10 +1968,10 @@ impl EntityCrud for ReleaseEntity { } // Have to strip out target refs and indexes, or hashing won't work well when // these change - let ref_list: Vec = ref_list + let ref_list: Vec = ref_list .iter() - .map(|r| { - let mut r = r.clone(); + .map(|r: &ReleaseRef| { + let mut r = RefsBlobJson::from_model(r); r.target_release_id = None; r.index = None; r -- cgit v1.2.3