diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-17 10:44:38 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-01-17 10:44:38 -0800 |
commit | 24e7a7038f9f0c827a11282a5cc646117ffdfb9b (patch) | |
tree | 40227dbd1a2b652bad24c9d71f8027c20ef1b7d1 | |
parent | 3ad702e413c084590505cff16f9532b236718049 (diff) | |
parent | 0a8c9a5e07213276617f06b0379a166e7fd1c100 (diff) | |
download | fatcat-24e7a7038f9f0c827a11282a5cc646117ffdfb9b.tar.gz fatcat-24e7a7038f9f0c827a11282a5cc646117ffdfb9b.zip |
Merge branch 'citation-efficiency'
Manually merged conflict in:
- ./TODO
-rw-r--r-- | python/fatcat_web/auth.py | 5 | ||||
-rw-r--r-- | python/fatcat_web/templates/release_view.html | 2 | ||||
-rw-r--r-- | python/tests/citation_efficiency.py | 113 | ||||
-rw-r--r-- | rust/Cargo.lock | 3 | ||||
-rw-r--r-- | rust/Cargo.toml | 3 | ||||
-rw-r--r-- | rust/migrations/2019-01-01-000000_init/down.sql | 1 | ||||
-rw-r--r-- | rust/migrations/2019-01-01-000000_init/up.sql | 61 | ||||
-rw-r--r-- | rust/src/database_models.rs | 107 | ||||
-rw-r--r-- | rust/src/database_schema.rs | 23 | ||||
-rw-r--r-- | rust/src/entity_crud.rs | 119 | ||||
-rw-r--r-- | rust/src/lib.rs | 2 | ||||
-rw-r--r-- | rust/tests/test_refs.rs | 161 |
12 files changed, 505 insertions, 95 deletions
diff --git a/python/fatcat_web/auth.py b/python/fatcat_web/auth.py index 8035cbe5..03964c92 100644 --- a/python/fatcat_web/auth.py +++ b/python/fatcat_web/auth.py @@ -90,7 +90,10 @@ def handle_ia_xauth(email, password): 'secret': Config.IA_XAUTH_CLIENT_SECRET, }) if resp.status_code == 401 or (not resp.json().get('success')): - flash("Internet Archive email/password didn't match: {}".format(resp.json()['values']['reason'])) + try: + flash("Internet Archive email/password didn't match: {}".format(resp.json()['values']['reason'])) + except: + print("IA XAuth fail: {}".format(resp.content)) return render_template('auth_ia_login.html', email=email), resp.status_code elif resp.status_code != 200: flash("Internet Archive login failed (internal error?)") diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html index fd86b7c9..4e24b281 100644 --- a/python/fatcat_web/templates/release_view.html +++ b/python/fatcat_web/templates/release_view.html @@ -143,7 +143,7 @@ Raw Object: {% endif %} <br> -{% if release.refs.size != 0 %} +{% if release.refs != None and release.refs.size != 0 %} <h3>References</h3> This release citing other releases. <ol> diff --git a/python/tests/citation_efficiency.py b/python/tests/citation_efficiency.py new file mode 100644 index 00000000..fe5006cc --- /dev/null +++ b/python/tests/citation_efficiency.py @@ -0,0 +1,113 @@ + +import json +import pytest +from copy import copy + +from fatcat_client import * +from fatcat_client.rest import ApiException +from fixtures import * + + +def test_citation_indexing(api): + # indexing is consistent and reacts to change + + eg = quick_eg(api) + r1 = ReleaseEntity(title="the target") + r1.refs = [ + ReleaseRef(key="first", title="the first title"), + ReleaseRef(key="second", title="the second title"), + ReleaseRef(key="third", title="a third title"), + ] + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + assert r1.refs[0].index == 0 + assert r1.refs[0].key == "first" + assert r1.refs[1].index == 1 + assert r1.refs[1].key == "second" + assert r1.refs[2].index == 2 + assert r1.refs[2].key == "third" + + r1.refs.pop(1) + eg = quick_eg(api) + api.update_release(r1.ident, r1, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + r1 = api.get_release(r1.ident) + + assert r1.refs[0].index == 0 + assert r1.refs[0].key == "first" + assert r1.refs[1].index == 1 + assert r1.refs[1].key == "third" + +def test_citation_targets(api): + # invariant to linking citations + # also, updates work + + eg = quick_eg(api) + r1 = ReleaseEntity(title="the target") + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + r2 = ReleaseEntity(title="the citer") + r2.refs = [ + ReleaseRef(key="first", title="something else"), + ReleaseRef(key="second", title="the target title"), + ] + r2 = api.get_release(api.create_release(r2, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + eg = quick_eg(api) + r2.refs[1].target_release_id = r1.ident + api.update_release(r2.ident, r2, editgroup_id=eg.editgroup_id) + api.accept_editgroup(eg.editgroup_id) + r2 = api.get_release(r2.ident) + assert r2.refs[0].key == "first" + assert r2.refs[1].key == "second" + assert r2.refs[0].index == 0 # TODO: one-indexing? + assert r2.refs[1].index == 1 + assert r2.refs[0].target_release_id == None + assert r2.refs[1].target_release_id == r1.ident + assert len(r2.refs) == 2 + +def test_citation_empty_array(api): + # distinction between empty array (no citations) and no array (hidden) + + r1 = ReleaseEntity(title="citation null") + r2 = ReleaseEntity(title="citation empty array") + r1.refs = None + r2.refs = [] + + eg = quick_eg(api) + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + r2 = api.get_release(api.create_release(r2, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + print(r1.refs) + print(r2.refs) + assert r1.refs == [] + assert r1.refs == r2.refs + + r1b = api.get_release(r1.ident, hide="refs") + assert r1b.refs == None + +def test_citation_encoding(api): + # escape-only changes (eg, \u1234 whatever for ASCII) + + r1 = ReleaseEntity(title="citation encoding") + title = "title-unicode \\u0050 \\\" " + container = "container-unicode ☃︎ ä ö ü スティー" + extra = extra={'a': 1, 'b': 2, 'ö': 3} + locator = "p123" + r1.refs = [ + ReleaseRef(key="1", year=1923, title=title, container_name=container, + extra=extra, locator=locator), + ReleaseRef(key="2"), + ] + + eg = quick_eg(api) + r1 = api.get_release(api.create_release(r1, editgroup_id=eg.editgroup_id).ident) + api.accept_editgroup(eg.editgroup_id) + + assert title == r1.refs[0].title + assert container == r1.refs[0].container_name + assert extra == r1.refs[0].extra + assert locator == r1.refs[0].locator + diff --git a/rust/Cargo.lock b/rust/Cargo.lock index e89954ad..c0df5a2a 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -538,6 +538,9 @@ dependencies = [ "rand 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "sentry 0.12.1 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.84 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.84 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_ignored 0.0.4 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.34 (registry+https://github.com/rust-lang/crates.io-index)", "sha1 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)", "slog 2.4.1 (registry+https://github.com/rust-lang/crates.io-index)", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index 155e3c8a..c5a52845 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -37,6 +37,9 @@ slog = "^2.0" slog-term = "*" slog-async = "*" serde_json = "1.0" +serde = "*" +serde_derive = "1.0" +serde_ignored = "0.0.4" sentry = { version = "^0.12", default-features = false, features = ["with_client_implementation", "with_backtrace", "with_panic", "with_log", "with_rust_info", "with_failure"] } cadence = "^0.16" diff --git a/rust/migrations/2019-01-01-000000_init/down.sql b/rust/migrations/2019-01-01-000000_init/down.sql index 30e712e3..e238a690 100644 --- a/rust/migrations/2019-01-01-000000_init/down.sql +++ b/rust/migrations/2019-01-01-000000_init/down.sql @@ -2,6 +2,7 @@ -- in opposite order as up.sql DROP TABLE IF EXISTS release_contrib CASCADE; +DROP TABLE IF EXISTS refs_blob CASCADE; DROP TABLE IF EXISTS release_ref CASCADE; DROP TABLE IF EXISTS file_rev_release CASCADE; DROP TABLE IF EXISTS fileset_rev_release CASCADE; diff --git a/rust/migrations/2019-01-01-000000_init/up.sql b/rust/migrations/2019-01-01-000000_init/up.sql index b4c7a684..57f91d44 100644 --- a/rust/migrations/2019-01-01-000000_init/up.sql +++ b/rust/migrations/2019-01-01-000000_init/up.sql @@ -82,6 +82,12 @@ CREATE TABLE abstracts ( content TEXT NOT NULL ); +CREATE TABLE refs_blob ( + -- fixed size hash (in hex). TODO: switch to bytes + sha1 TEXT PRIMARY KEY CHECK (octet_length(sha1) = 40), + refs_json JSONB NOT NULL +); + -------------------- Creators ----------------------------------------------- CREATE TABLE creator_rev ( id UUID PRIMARY KEY DEFAULT uuid_generate_v4(), @@ -326,6 +332,7 @@ CREATE TABLE release_rev ( work_ident_id UUID NOT NULL, -- FOREIGN KEY; see ALRTER below container_ident_id UUID REFERENCES container_ident(id), + refs_blob_sha1 TEXT REFERENCES refs_blob(sha1), title TEXT NOT NULL, release_type TEXT, -- TODO: enum release_status TEXT, -- TODO: enum @@ -436,20 +443,19 @@ CREATE INDEX release_contrib_rev_idx ON release_contrib(release_rev); CREATE INDEX release_contrib_creator_idx ON release_contrib(creator_ident_id); CREATE TABLE release_ref ( - id BIGSERIAL PRIMARY KEY, release_rev UUID REFERENCES release_rev(id) NOT NULL, - target_release_ident_id UUID REFERENCES release_ident(id), -- or work? - index_val INTEGER, - key TEXT, - extra_json JSONB, -- title, year, container_title, locator (aka, page), oci_id - container_name TEXT, - year INTEGER, - title TEXT, - locator TEXT - -- TODO: oci_id (TEXT) -); - -CREATE INDEX release_ref_rev_idx ON release_ref(release_rev); + index_val INTEGER NOT NULL, + target_release_ident_id UUID REFERENCES release_ident(id) NOT NULL, + -- all other fields are interned in refs_blob as JSONB + -- key TEXT, + -- extra_json JSONB, -- title, year, container_title, locator (aka, page), oci_id + -- container_name TEXT, + -- year INTEGER, + -- title TEXT, + -- locator TEXT + PRIMARY KEY(release_rev, index_val) +); + CREATE INDEX release_ref_target_release_idx ON release_ref(target_release_ident_id); CREATE TABLE file_rev_release ( @@ -632,10 +638,14 @@ INSERT INTO work_edit (ident_id, rev_id, redirect_id, editgroup_id, prev_rev) VA ('00000000-0000-0000-5555-000000000002', '00000000-0000-0000-5555-FFF000000002', null, '00000000-0000-0000-BBBB-000000000004', null), ('00000000-0000-0000-5555-000000000002', '00000000-0000-0000-5555-FFF000000003', null, '00000000-0000-0000-BBBB-000000000005', '00000000-0000-0000-5555-FFF000000002'); -INSERT INTO release_rev (id, work_ident_id, container_ident_id, title, release_type, release_status, release_date, release_year, doi, wikidata_qid, pmid, pmcid, isbn13, core_id, volume, issue, pages, publisher, language) VALUES - ('00000000-0000-0000-4444-FFF000000001', '00000000-0000-0000-5555-000000000001', null, 'example title', null, null, null, null, null, null, null, null, null, null, null, null, null, null, null), - ('00000000-0000-0000-4444-FFF000000002', '00000000-0000-0000-5555-000000000002', '00000000-0000-0000-1111-000000000001', 'bigger example', 'article-journal', null, '2018-01-01', 2018, '10.123/abc', 'Q55555', '54321', 'PMC555','978-3-16-148410-0', '42022773', '12', 'IV', '5-9', 'bogus publishing group', 'cn'), - ('00000000-0000-0000-4444-FFF000000003', '00000000-0000-0000-5555-000000000003', '00000000-0000-0000-1111-000000000003', 'Why Most Published Research Findings Are False', 'article-journal', 'published', '2005-08-30', 2005, '10.1371/journal.pmed.0020124', null, null, null, null, null, '2', '8', 'e124', 'Public Library of Science', 'en'); +INSERT INTO refs_blob (sha1, refs_json) VALUES + ('22222222c2979a62d29b18b537e50b2b093be27e', '[{}, {}, {}, {}, {"extra": {"unstructured":"citation note"}}]'), + ('33333333c2979a62d29b18b537e50b2b093be27e', '[{"extra": {"unstructured": "Ioannidis JP, Haidich AB, Lau J. Any casualties in the clash of randomised and observational evidence? BMJ. 2001;322:879–880"}}, {"extra": {"unstructured":"Lawlor DA, Davey Smith G, Kundu D, Bruckdorfer KR, Ebrahim S. Those confounded vitamins: What can we learn from the differences between observational versus randomised trial evidence? Lancet. 2004;363:1724–1727."}}, {"extra": {"unstructured":"Vandenbroucke JP. When are observational studies as credible as randomised trials? Lancet. 2004;363:1728–1731."}}, {"extra": {"unstructured":"Michiels S, Koscielny S, Hill C. Prediction of cancer outcome with microarrays: A multiple random validation strategy. Lancet. 2005;365:488–492."}}, {"extra": {"unstructured":"Ioannidis JPA, Ntzani EE, Trikalinos TA, Contopoulos-Ioannidis DG. Replication validity of genetic association studies. Nat Genet. 2001;29:306–309."}}, {"extra": {"unstructured":"Colhoun HM, McKeigue PM, Davey Smith G. Problems of reporting genetic associations with complex outcomes. Lancet. 2003;361:865–872."}}]'); + +INSERT INTO release_rev (id, work_ident_id, container_ident_id, title, release_type, release_status, release_date, release_year, doi, wikidata_qid, pmid, pmcid, isbn13, core_id, volume, issue, pages, publisher, language, refs_blob_sha1) VALUES + ('00000000-0000-0000-4444-FFF000000001', '00000000-0000-0000-5555-000000000001', null, 'example title', null, null, null, null, null, null, null, null, null, null, null, null, null, null, null, null), + ('00000000-0000-0000-4444-FFF000000002', '00000000-0000-0000-5555-000000000002', '00000000-0000-0000-1111-000000000001', 'bigger example', 'article-journal', null, '2018-01-01', 2018, '10.123/abc', 'Q55555', '54321', 'PMC555','978-3-16-148410-0', '42022773', '12', 'IV', '5-9', 'bogus publishing group', 'cn', '22222222c2979a62d29b18b537e50b2b093be27e'), + ('00000000-0000-0000-4444-FFF000000003', '00000000-0000-0000-5555-000000000003', '00000000-0000-0000-1111-000000000003', 'Why Most Published Research Findings Are False', 'article-journal', 'published', '2005-08-30', 2005, '10.1371/journal.pmed.0020124', null, null, null, null, null, '2', '8', 'e124', 'Public Library of Science', 'en', '33333333c2979a62d29b18b537e50b2b093be27e'); INSERT INTO release_ident (id, is_live, rev_id, redirect_id) VALUES ('00000000-0000-0000-4444-000000000001', true, '00000000-0000-0000-4444-FFF000000001', null), -- aaaaaaaaaaaaarceaaaaaaaaae @@ -656,15 +666,14 @@ INSERT INTO release_contrib (release_rev, creator_ident_id, raw_name, role, inde ('00000000-0000-0000-4444-FFF000000002', '00000000-0000-0000-2222-000000000002', 'some contrib', 'editor', 4), ('00000000-0000-0000-4444-FFF000000003', '00000000-0000-0000-2222-000000000003', 'John P. A. Ioannidis', 'author', 0); -INSERT INTO release_ref (release_rev, target_release_ident_id, index_val, extra_json) VALUES - ('00000000-0000-0000-4444-FFF000000002', null, null, null), - ('00000000-0000-0000-4444-FFF000000002', '00000000-0000-0000-4444-000000000001', 4, '{"unstructured":"citation note"}'), - ('00000000-0000-0000-4444-FFF000000003', null, 0, '{"unstructured": "Ioannidis JP, Haidich AB, Lau J. Any casualties in the clash of randomised and observational evidence? BMJ. 2001;322:879–880"}'), - ('00000000-0000-0000-4444-FFF000000003', null, 1, '{"unstructured":"Lawlor DA, Davey Smith G, Kundu D, Bruckdorfer KR, Ebrahim S. Those confounded vitamins: What can we learn from the differences between observational versus randomised trial evidence? Lancet. 2004;363:1724–1727."}'), - ('00000000-0000-0000-4444-FFF000000003', null, 2, '{"unstructured":"Vandenbroucke JP. When are observational studies as credible as randomised trials? Lancet. 2004;363:1728–1731."}'), - ('00000000-0000-0000-4444-FFF000000003', null, 3, '{"unstructured":"Michiels S, Koscielny S, Hill C. Prediction of cancer outcome with microarrays: A multiple random validation strategy. Lancet. 2005;365:488–492."}'), - ('00000000-0000-0000-4444-FFF000000003', null, 4, '{"unstructured":"Ioannidis JPA, Ntzani EE, Trikalinos TA, Contopoulos-Ioannidis DG. Replication validity of genetic association studies. Nat Genet. 2001;29:306–309."}'), - ('00000000-0000-0000-4444-FFF000000003', null, 5, '{"unstructured":"Colhoun HM, McKeigue PM, Davey Smith G. Problems of reporting genetic associations with complex outcomes. Lancet. 2003;361:865–872."}'); +INSERT INTO release_ref (release_rev, index_val, target_release_ident_id) VALUES + ('00000000-0000-0000-4444-FFF000000002', 4, '00000000-0000-0000-4444-000000000001'), -- '{"unstructured":"citation note"}'), + ('00000000-0000-0000-4444-FFF000000003', 0, '00000000-0000-0000-4444-000000000001'), --'{"unstructured": "Ioannidis JP, Haidich AB, Lau J. Any casualties in the clash of randomised and observational evidence? BMJ. 2001;322:879–880"}'), + ('00000000-0000-0000-4444-FFF000000003', 1, '00000000-0000-0000-4444-000000000001'), --'{"unstructured":"Lawlor DA, Davey Smith G, Kundu D, Bruckdorfer KR, Ebrahim S. Those confounded vitamins: What can we learn from the differences between observational versus randomised trial evidence? Lancet. 2004;363:1724–1727."}'), + ('00000000-0000-0000-4444-FFF000000003', 2, '00000000-0000-0000-4444-000000000001'), --'{"unstructured":"Vandenbroucke JP. When are observational studies as credible as randomised trials? Lancet. 2004;363:1728–1731."}'), + ('00000000-0000-0000-4444-FFF000000003', 3, '00000000-0000-0000-4444-000000000001'), --'{"unstructured":"Michiels S, Koscielny S, Hill C. Prediction of cancer outcome with microarrays: A multiple random validation strategy. Lancet. 2005;365:488–492."}'), + ('00000000-0000-0000-4444-FFF000000003', 4, '00000000-0000-0000-4444-000000000001'), --'{"unstructured":"Ioannidis JPA, Ntzani EE, Trikalinos TA, Contopoulos-Ioannidis DG. Replication validity of genetic association studies. Nat Genet. 2001;29:306–309."}'), + ('00000000-0000-0000-4444-FFF000000003', 5, '00000000-0000-0000-4444-000000000001'); --'{"unstructured":"Colhoun HM, McKeigue PM, Davey Smith G. Problems of reporting genetic associations with complex outcomes. Lancet. 2003;361:865–872."}'); INSERT INTO file_rev_release (file_rev, target_release_ident_id) VALUES ('00000000-0000-0000-3333-FFF000000002', '00000000-0000-0000-4444-000000000002'), diff --git a/rust/src/database_models.rs b/rust/src/database_models.rs index 63fbcb29..b76b469a 100644 --- a/rust/src/database_models.rs +++ b/rust/src/database_models.rs @@ -4,7 +4,7 @@ use crate::database_schema::*; use crate::errors::*; use crate::identifiers::uuid2fcid; use chrono; -use fatcat_api_spec::models::{ChangelogEntry, Editgroup, EditgroupAnnotation, Editor, EntityEdit}; +use fatcat_api_spec::models::{ChangelogEntry, Editgroup, EditgroupAnnotation, Editor, EntityEdit, ReleaseRef}; use serde_json; use uuid::Uuid; @@ -376,6 +376,7 @@ pub struct ReleaseRevRow { pub extra_json: Option<serde_json::Value>, pub work_ident_id: Uuid, pub container_ident_id: Option<Uuid>, + pub refs_blob_sha1: Option<String>, pub title: String, pub release_type: Option<String>, pub release_status: Option<String>, @@ -400,6 +401,7 @@ pub struct ReleaseRevNewRow { pub extra_json: Option<serde_json::Value>, pub work_ident_id: Uuid, pub container_ident_id: Option<Uuid>, + pub refs_blob_sha1: Option<String>, pub title: String, pub release_type: Option<String>, pub release_status: Option<String>, @@ -491,35 +493,102 @@ pub struct ReleaseContribNewRow { pub extra_json: Option<serde_json::Value>, } -#[derive(Debug, Queryable, Identifiable, Associations, AsChangeset)] +#[derive(Debug, Queryable, Insertable, Associations, AsChangeset)] #[table_name = "release_ref"] pub struct ReleaseRefRow { - pub id: i64, pub release_rev: Uuid, - pub target_release_ident_id: Option<Uuid>, - pub index_val: Option<i32>, - pub key: Option<String>, - pub extra_json: Option<serde_json::Value>, - pub container_name: Option<String>, - pub year: Option<i32>, - pub title: Option<String>, - pub locator: Option<String>, + pub index_val: i32, + pub target_release_ident_id: Uuid, } -#[derive(Debug, Insertable, AsChangeset)] -#[table_name = "release_ref"] -pub struct ReleaseRefNewRow { - pub release_rev: Uuid, - pub target_release_ident_id: Option<Uuid>, - pub index_val: Option<i32>, +#[derive(Debug, Queryable, Insertable, Associations, AsChangeset)] +#[table_name = "refs_blob"] +pub struct RefsBlobRow { + pub sha1: String, + pub refs_json: serde_json::Value, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +/// This model is a stable representation of what goes in a RefsBlobRow `refs_json` field (an array +/// of this model). We could rely on the `ReleaseRef` API spec model directly, but that would lock +/// the database contents to the API spec rigidly; by defining this struct independently, we can +/// migrate the schemas. To start, this is a direct copy of the `ReleaseRef` model. +pub struct RefsBlobJson { + #[serde(rename = "index")] + #[serde(skip_serializing_if = "Option::is_none")] + pub index: Option<i64>, + + /// base32-encoded unique identifier + #[serde(rename = "target_release_id")] + #[serde(skip_serializing_if = "Option::is_none")] + pub target_release_id: Option<String>, + + #[serde(rename = "extra")] + #[serde(skip_serializing_if = "Option::is_none")] + pub extra: Option<serde_json::Value>, + + #[serde(rename = "key")] + #[serde(skip_serializing_if = "Option::is_none")] pub key: Option<String>, - pub extra_json: Option<serde_json::Value>, + + #[serde(rename = "year")] + #[serde(skip_serializing_if = "Option::is_none")] + pub year: Option<i64>, + + #[serde(rename = "container_name")] + #[serde(skip_serializing_if = "Option::is_none")] pub container_name: Option<String>, - pub year: Option<i32>, + + #[serde(rename = "title")] + #[serde(skip_serializing_if = "Option::is_none")] pub title: Option<String>, + + #[serde(rename = "locator")] + #[serde(skip_serializing_if = "Option::is_none")] pub locator: Option<String>, } +impl RefsBlobJson { + pub fn into_model(self) -> ReleaseRef { + ReleaseRef { + index: self.index, + target_release_id: self.target_release_id, + extra: self.extra, + key: self.key, + year: self.year, + container_name: self.container_name, + title: self.title, + locator: self.locator, + } + } + + pub fn to_model(&self) -> ReleaseRef { + ReleaseRef { + index: self.index, + target_release_id: self.target_release_id.clone(), + extra: self.extra.clone(), + key: self.key.clone(), + year: self.year, + container_name: self.container_name.clone(), + title: self.title.clone(), + locator: self.locator.clone(), + } + } + + pub fn from_model(model: &ReleaseRef) -> RefsBlobJson { + RefsBlobJson { + index: model.index, + target_release_id: model.target_release_id.clone(), + extra: model.extra.clone(), + key: model.key.clone(), + year: model.year, + container_name: model.container_name.clone(), + title: model.title.clone(), + locator: model.locator.clone(), + } + } +} + #[derive(Debug, Queryable, Insertable, Associations, AsChangeset)] #[table_name = "file_rev_release"] pub struct FileRevReleaseRow { diff --git a/rust/src/database_schema.rs b/rust/src/database_schema.rs index 3bc57d95..0a067a10 100644 --- a/rust/src/database_schema.rs +++ b/rust/src/database_schema.rs @@ -239,6 +239,13 @@ table! { } table! { + refs_blob (sha1) { + sha1 -> Text, + refs_json -> Jsonb, + } +} + +table! { release_contrib (id) { id -> Int8, release_rev -> Uuid, @@ -273,17 +280,10 @@ table! { } table! { - release_ref (id) { - id -> Int8, + release_ref (release_rev, index_val) { release_rev -> Uuid, - target_release_ident_id -> Nullable<Uuid>, - index_val -> Nullable<Int4>, - key -> Nullable<Text>, - extra_json -> Nullable<Jsonb>, - container_name -> Nullable<Text>, - year -> Nullable<Int4>, - title -> Nullable<Text>, - locator -> Nullable<Text>, + index_val -> Int4, + target_release_ident_id -> Uuid, } } @@ -293,6 +293,7 @@ table! { extra_json -> Nullable<Jsonb>, work_ident_id -> Uuid, container_ident_id -> Nullable<Uuid>, + refs_blob_sha1 -> Nullable<Text>, title -> Text, release_type -> Nullable<Text>, release_status -> Nullable<Text>, @@ -439,6 +440,7 @@ joinable!(release_ident -> release_rev (rev_id)); joinable!(release_ref -> release_ident (target_release_ident_id)); joinable!(release_ref -> release_rev (release_rev)); joinable!(release_rev -> container_ident (container_ident_id)); +joinable!(release_rev -> refs_blob (refs_blob_sha1)); joinable!(release_rev -> work_ident (work_ident_id)); joinable!(release_rev_abstract -> abstracts (abstract_sha1)); joinable!(release_rev_abstract -> release_rev (release_rev)); @@ -475,6 +477,7 @@ allow_tables_to_appear_in_same_query!( fileset_rev_file, fileset_rev_release, fileset_rev_url, + refs_blob, release_contrib, release_edit, release_ident, diff --git a/rust/src/entity_crud.rs b/rust/src/entity_crud.rs index ce1c1ed7..09ce9542 100644 --- a/rust/src/entity_crud.rs +++ b/rust/src/entity_crud.rs @@ -1812,28 +1812,27 @@ impl EntityCrud for ReleaseEntity { None => (None, None, None), }; - let refs: Option<Vec<ReleaseRef>> = match hide.refs { - true => None, - false => Some( - release_ref::table + let refs: Option<Vec<ReleaseRef>> = match (hide.refs, rev_row.refs_blob_sha1) { + (true, _) => None, + (false, None) => Some(vec![]), + (false, Some(sha1)) => Some({ + let refs_blob: RefsBlobRow = refs_blob::table + .find(sha1) // checked in match + .get_result(conn)?; + let refs: Vec<RefsBlobJson> = serde_json::from_value(refs_blob.refs_json)?; + let mut refs: Vec<ReleaseRef> = refs.into_iter().map(|j| j.into_model()).collect(); + let ref_rows: Vec<ReleaseRefRow> = release_ref::table .filter(release_ref::release_rev.eq(rev_row.id)) .order(release_ref::index_val.asc()) - .get_results(conn)? - .into_iter() - .map(|r: ReleaseRefRow| ReleaseRef { - index: r.index_val.map(|v| v as i64), - key: r.key, - extra: r.extra_json, - container_name: r.container_name, - year: r.year.map(|v| v as i64), - title: r.title, - locator: r.locator, - target_release_id: r - .target_release_ident_id - .map(|v| FatcatId::from_uuid(&v).to_string()), - }) - .collect(), - ), + .get_results(conn)?; + for index in 0..refs.len() { + refs[index].index = Some(index as i64) + } + for row in ref_rows { + refs[row.index_val as usize].target_release_id = Some(FatcatId::from_uuid(&row.target_release_ident_id).to_string()); + } + refs + }), }; let contribs: Option<Vec<ReleaseContrib>> = match hide.contribs { @@ -1953,12 +1952,60 @@ impl EntityCrud for ReleaseEntity { .into()); } + // First, calculate and upsert any refs JSON blobs and record the SHA1 keys, so they can be + // included in the release_rev row itself + let mut refs_blob_rows: Vec<RefsBlobRow> = vec![]; + let mut refs_blob_sha1: Vec<Option<String>> = vec![]; + for model in models.iter() { + match &model.refs { + None => { + refs_blob_sha1.push(None); + }, + Some(ref_list) => { + if ref_list.is_empty() { + refs_blob_sha1.push(None); + continue + } + // Have to strip out target refs and indexes, or hashing won't work well when + // these change + let ref_list: Vec<RefsBlobJson> = ref_list + .iter() + .map(|r: &ReleaseRef| { + let mut r = RefsBlobJson::from_model(r); + r.target_release_id = None; + r.index = None; + r + }) + .collect(); + // TODO: maybe `canonical_json` crate? + let refs_json = serde_json::to_value(ref_list)?; + let refs_str = refs_json.to_string(); + let sha1 = Sha1::from(refs_str).hexdigest(); + let blob = RefsBlobRow { sha1: sha1.clone(), refs_json }; + refs_blob_rows.push(blob); + refs_blob_sha1.push(Some(sha1)); + } + }; + } + + if !refs_blob_rows.is_empty() { + // Sort of an "upsert"; only inserts new abstract rows if they don't already exist + insert_into(refs_blob::table) + .values(&refs_blob_rows) + .on_conflict(refs_blob::sha1) + .do_nothing() + .execute(conn)?; + } + + // Then the main release_revs themselves let rev_ids: Vec<Uuid> = insert_into(release_rev::table) .values( models .iter() - .map(|model| { + .zip(refs_blob_sha1.into_iter()) + .map(|(model, refs_sha1)| { Ok(ReleaseRevNewRow { + refs_blob_sha1: refs_sha1, title: model.title.clone().unwrap(), // titles checked above release_type: model.release_type.clone(), release_status: model.release_status.clone(), @@ -1991,34 +2038,30 @@ impl EntityCrud for ReleaseEntity { .returning(release_rev::id) .get_results(conn)?; - let mut release_ref_rows: Vec<ReleaseRefNewRow> = vec![]; + let mut release_ref_rows: Vec<ReleaseRefRow> = vec![]; let mut release_contrib_rows: Vec<ReleaseContribNewRow> = vec![]; let mut abstract_rows: Vec<AbstractsRow> = vec![]; let mut release_abstract_rows: Vec<ReleaseRevAbstractNewRow> = vec![]; for (model, rev_id) in models.iter().zip(rev_ids.iter()) { + + // We didn't know the release_rev id to insert here, so need to re-iterate over refs match &model.refs { None => (), Some(ref_list) => { - let these_ref_rows: Vec<ReleaseRefNewRow> = ref_list + let these_ref_rows: Vec<ReleaseRefRow> = ref_list .iter() - .map(|r| { - Ok(ReleaseRefNewRow { + .enumerate() + .filter(|(_, r)| r.target_release_id.is_some()) + .map(|(index, r)| { + Ok(ReleaseRefRow { release_rev: *rev_id, - target_release_ident_id: match r.target_release_id.clone() { - None => None, - Some(v) => Some(FatcatId::from_str(&v)?.to_uuid()), - }, - index_val: r.index.map(|v| v as i32), - key: r.key.clone(), - container_name: r.container_name.clone(), - year: r.year.map(|v| v as i32), - title: r.title.clone(), - locator: r.locator.clone(), - extra_json: r.extra.clone(), + // unwrap() checked by is_some() filter + target_release_ident_id: FatcatId::from_str(&r.target_release_id.clone().unwrap())?.to_uuid(), + index_val: index as i32, }) }) - .collect::<Result<Vec<ReleaseRefNewRow>>>()?; + .collect::<Result<Vec<ReleaseRefRow>>>()?; release_ref_rows.extend(these_ref_rows); } }; @@ -2053,7 +2096,7 @@ impl EntityCrud for ReleaseEntity { .iter() .filter(|ea| ea.content.is_some()) .map(|c| AbstractsRow { - sha1: Sha1::from(c.content.clone().unwrap()).hexdigest(), + sha1: Sha1::from(c.content.as_ref().unwrap()).hexdigest(), content: c.content.clone().unwrap(), }) .collect(); diff --git a/rust/src/lib.rs b/rust/src/lib.rs index b7661334..d089adf8 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -9,6 +9,8 @@ extern crate log; extern crate lazy_static; #[macro_use] extern crate failure; +#[macro_use] +extern crate serde_derive; pub mod auth; pub mod database_models; diff --git a/rust/tests/test_refs.rs b/rust/tests/test_refs.rs new file mode 100644 index 00000000..a2bf37ba --- /dev/null +++ b/rust/tests/test_refs.rs @@ -0,0 +1,161 @@ + +use fatcat::identifiers::FatcatId; +use fatcat::server; +use std::str::FromStr; +use fatcat::database_models::*; +use fatcat::database_schema::*; +use fatcat::entity_crud::{EntityCrud, HideFlags}; +use fatcat::editing::{make_edit_context, accept_editgroup}; +use fatcat_api_spec::models::*; +use diesel::prelude::*; +use uuid::Uuid; + +mod helpers; + +#[test] +fn test_refs_blob() { + let server = server::create_test_server().unwrap(); + let conn = server.db_pool.get().expect("db_pool error"); + let editor_id = FatcatId::from_str(helpers::TEST_ADMIN_EDITOR_ID).unwrap(); + let editgroup_id = helpers::quick_editgroup(&conn); + let edit_context = make_edit_context(&conn, editor_id, Some(editgroup_id), false).unwrap(); + + // this release entity should be unchanged after being inserted/fetched + let mut r1 = ReleaseEntity::new(); + r1.title = Some("release-test hashes".to_string()); + r1.refs = Some(vec![ + ReleaseRef { + index: Some(0), + target_release_id: None, + extra: None, + key: Some("one".to_string()), + year: Some(1932), + container_name: Some("bogus container".to_string()), + title: Some("first bogus paper".to_string()), + locator: Some("p100".to_string()), + }, + ReleaseRef { + index: Some(1), + target_release_id: Some("aaaaaaaaaaaaarceaaaaaaaaai".to_string()), + extra: None, + key: Some("one".to_string()), + year: Some(2032), + container_name: Some("bogus other container".to_string()), + title: Some("second bogus paper".to_string()), + locator: Some("p200".to_string()), + }, + ]); + + // this release entity should have the same hash as r1. the indexes will change after fetching, + // but otherwise the fetched refs should be the same as the r1 fetched results. + let mut r2 = r1.clone(); + r2.refs = Some(vec![ + ReleaseRef { + index: None, + target_release_id: None, + extra: None, + key: Some("one".to_string()), + year: Some(1932), + container_name: Some("bogus container".to_string()), + title: Some("first bogus paper".to_string()), + locator: Some("p100".to_string()), + }, + ReleaseRef { + index: Some(99), + target_release_id: Some("aaaaaaaaaaaaarceaaaaaaaaai".to_string()), + extra: None, + key: Some("one".to_string()), + year: Some(2032), + container_name: Some("bogus other container".to_string()), + title: Some("second bogus paper".to_string()), + locator: Some("p200".to_string()), + }, + ]); + + // this release entity has different ref *targets* and indexes, but should still have the same + // refs_blob hashes as r1/r2. + let mut r3 = r1.clone(); + r3.refs = Some(vec![ + ReleaseRef { + index: Some(1), + target_release_id: Some("aaaaaaaaaaaaarceaaaaaaaaae".to_string()), + extra: None, + key: Some("one".to_string()), + year: Some(1932), + container_name: Some("bogus container".to_string()), + title: Some("first bogus paper".to_string()), + locator: Some("p100".to_string()), + }, + ReleaseRef { + index: Some(1), + target_release_id: Some("aaaaaaaaaaaaarceaaaaaaaaam".to_string()), + extra: None, + key: Some("one".to_string()), + year: Some(2032), + container_name: Some("bogus other container".to_string()), + title: Some("second bogus paper".to_string()), + locator: Some("p200".to_string()), + }, + ]); + + // this one is obviously just plain different (hashes shouldn't match) + let mut r4 = r1.clone(); + r4.refs = Some(vec![ + ReleaseRef { + index: Some(1), + target_release_id: Some("aaaaaaaaaaaaarceaaaaaaaaae".to_string()), + extra: None, + key: Some("one".to_string()), + year: Some(1932), + container_name: Some("bogus container".to_string()), + title: Some("first bogus paper".to_string()), + locator: Some("p100".to_string()), + }, + ]); + + let edit1 = r1.db_create(&conn, &edit_context).unwrap(); + let edit2 = r2.db_create(&conn, &edit_context).unwrap(); + let edit3 = r3.db_create(&conn, &edit_context).unwrap(); + let edit4 = r4.db_create(&conn, &edit_context).unwrap(); + + let r1b = ReleaseEntity::db_get(&conn, edit1.ident_id.into(), HideFlags::none()).unwrap(); + let r2b = ReleaseEntity::db_get(&conn, edit2.ident_id.into(), HideFlags::none()).unwrap(); + let r3b = ReleaseEntity::db_get(&conn, edit3.ident_id.into(), HideFlags::none()).unwrap(); + let r4b = ReleaseEntity::db_get(&conn, edit4.ident_id.into(), HideFlags::none()).unwrap(); + assert_eq!(r1b.refs, r1.refs); + assert_eq!(r1b.refs, r2b.refs); + assert_ne!(r1b.refs, r3b.refs); + assert_ne!(r1b.refs, r4b.refs); + + let r1_row: ReleaseRevRow = release_rev::table + .find(Uuid::from_str(&r1b.revision.clone().unwrap()).unwrap()) + .get_result(&conn).unwrap(); + let r2_row: ReleaseRevRow = release_rev::table + .find(Uuid::from_str(&r2b.revision.unwrap()).unwrap()) + .get_result(&conn).unwrap(); + let r3_row: ReleaseRevRow = release_rev::table + .find(Uuid::from_str(&r3b.revision.clone().unwrap()).unwrap()) + .get_result(&conn).unwrap(); + let r4_row: ReleaseRevRow = release_rev::table + .find(Uuid::from_str(&r4b.revision.unwrap()).unwrap()) + .get_result(&conn).unwrap(); + assert_eq!(r1_row.refs_blob_sha1, r2_row.refs_blob_sha1); + assert_eq!(r1_row.refs_blob_sha1, r3_row.refs_blob_sha1); + assert_ne!(r1_row.refs_blob_sha1, r4_row.refs_blob_sha1); + + // ensure that SHA1 hashing is stable over time (as much as possible!) + assert_eq!(r1_row.refs_blob_sha1, Some("4e38812fbf99e00e0cb648896e9f7a9d58c5ab23".to_string())); + + // update r1 with new target_idents (r3); SHA1 row still shouldn't change + accept_editgroup(&conn, editgroup_id).unwrap(); + let editgroup_id = helpers::quick_editgroup(&conn); + let edit_context = make_edit_context(&conn, editor_id, Some(editgroup_id), false).unwrap(); + + let _edit4 = r3b.db_update(&conn, &edit_context, edit1.ident_id.into()).unwrap(); + let r1c = ReleaseEntity::db_get(&conn, edit1.ident_id.into(), HideFlags::none()).unwrap(); + let r1c_row: ReleaseRevRow = release_rev::table + .find(Uuid::from_str(&r1c.revision.unwrap()).unwrap()) + .get_result(&conn).unwrap(); + assert_eq!(r1_row.refs_blob_sha1, r1c_row.refs_blob_sha1); +} + |