From 9beb3caee51c6bb0403c658a71c965dde4c8e55b Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Wed, 18 Dec 2019 22:08:38 -0800 Subject: move SQL schema to diesel migration pattern --- .../00000000000000_diesel_initial_setup/down.sql | 6 ++ .../00000000000000_diesel_initial_setup/up.sql | 36 +++++++ sql/migrations/2019-12-19-060141_init/down.sql | 8 ++ sql/migrations/2019-12-19-060141_init/up.sql | 106 +++++++++++++++++++++ sql/sandcrawler_schema.sql | 71 +------------- 5 files changed, 157 insertions(+), 70 deletions(-) create mode 100644 sql/migrations/00000000000000_diesel_initial_setup/down.sql create mode 100644 sql/migrations/00000000000000_diesel_initial_setup/up.sql create mode 100644 sql/migrations/2019-12-19-060141_init/down.sql create mode 100644 sql/migrations/2019-12-19-060141_init/up.sql mode change 100644 => 120000 sql/sandcrawler_schema.sql (limited to 'sql') diff --git a/sql/migrations/00000000000000_diesel_initial_setup/down.sql b/sql/migrations/00000000000000_diesel_initial_setup/down.sql new file mode 100644 index 0000000..a9f5260 --- /dev/null +++ b/sql/migrations/00000000000000_diesel_initial_setup/down.sql @@ -0,0 +1,6 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. + +DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass); +DROP FUNCTION IF EXISTS diesel_set_updated_at(); diff --git a/sql/migrations/00000000000000_diesel_initial_setup/up.sql b/sql/migrations/00000000000000_diesel_initial_setup/up.sql new file mode 100644 index 0000000..d68895b --- /dev/null +++ b/sql/migrations/00000000000000_diesel_initial_setup/up.sql @@ -0,0 +1,36 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. + + + + +-- Sets up a trigger for the given table to automatically set a column called +-- `updated_at` whenever the row is modified (unless `updated_at` was included +-- in the modified columns) +-- +-- # Example +-- +-- ```sql +-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW()); +-- +-- SELECT diesel_manage_updated_at('users'); +-- ``` +CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$ +BEGIN + EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s + FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl); +END; +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$ +BEGIN + IF ( + NEW IS DISTINCT FROM OLD AND + NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at + ) THEN + NEW.updated_at := current_timestamp; + END IF; + RETURN NEW; +END; +$$ LANGUAGE plpgsql; diff --git a/sql/migrations/2019-12-19-060141_init/down.sql b/sql/migrations/2019-12-19-060141_init/down.sql new file mode 100644 index 0000000..a085480 --- /dev/null +++ b/sql/migrations/2019-12-19-060141_init/down.sql @@ -0,0 +1,8 @@ + +DROP TABLE IF NOT EXISTS cdx; +DROP TABLE IF NOT EXISTS file_meta; +DROP TABLE IF NOT EXISTS fatcat_file; +DROP TABLE IF NOT EXISTS petabox; +DROP TABLE IF NOT EXISTS grobid; +DROP TABLE IF NOT EXISTS ingest_request; +DROP TABLE IF NOT EXISTS shadow; diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql new file mode 100644 index 0000000..12ed409 --- /dev/null +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -0,0 +1,106 @@ + +CREATE TABLE IF NOT EXISTS cdx ( + url TEXT NOT NULL CHECK (octet_length(url) >= 1), + datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14), + sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), + cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40), + mimetype TEXT CHECK (octet_length(mimetype) >= 1), + warc_path TEXT CHECK (octet_length(warc_path) >= 1), + warc_csize BIGINT, + warc_offset BIGINT, + row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + PRIMARY KEY(url, datetime) +); +CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex); +CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created); + +CREATE TABLE IF NOT EXISTS file_meta ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + sha256hex TEXT CHECK (octet_length(sha256hex) = 64), + md5hex TEXT CHECK (octet_length(md5hex) = 32), + size_bytes BIGINT, + mimetype TEXT CHECK (octet_length(mimetype) >= 1) +); + +CREATE TABLE IF NOT EXISTS fatcat_file ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + file_ident TEXT CHECK (octet_length(file_ident) = 26), + first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26) +); + +CREATE TABLE IF NOT EXISTS petabox ( + item TEXT NOT NULL CHECK (octet_length(item) >= 1), + path TEXT NOT NULL CHECK (octet_length(path) >= 1), + sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), + PRIMARY KEY(item, path) +); +CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex); + +CREATE TABLE IF NOT EXISTS grobid ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + grobid_version TEXT CHECK (octet_length(grobid_version) >= 1), + status_code INT NOT NULL, + status TEXT CHECK (octet_length(status) >= 1), + fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26), + -- extracted basic biblio metadata: + -- title + -- authors[] + -- full/display + -- given_name + -- surname + -- affiliation + -- year + -- journal_issn + -- journal_name + -- refs_count + metadata JSONB +); +-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release); + +CREATE TABLE IF NOT EXISTS ingest_request ( + link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1), + link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1), + ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), + base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1), + + ingest_request_source TEXT CHECK (octet_length(ingest_request_source) >= 1), + created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + release_stage TEXT CHECK (octet_length(release_stage) >= 1), + request JSONB, + -- request isn't required, but can stash extra fields there for import, eg: + -- ext_ids (source/source_id sometimes enough) + -- fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN) + -- edit_extra + + PRIMARY KEY (link_source, link_source_id, ingest_type, base_url) +); +CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type); + +CREATE TABLE IF NOT EXISTS ingest_file_result ( + ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), + base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1), + + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + hit BOOLEAN NOT NULL, + status TEXT CHECK (octet_length(terminal_url) >= 1), + terminal_url TEXT CHECK (octet_length(terminal_url) >= 1), + terminal_dt TEXT CHECK (octet_length(terminal_dt) = 14), + terminal_status_code INT, + terminal_sha1hex TEXT CHECK (octet_length(terminal_sha1hex) = 40), + + PRIMARY KEY (ingest_type, base_url) +); +CREATE INDEX ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url); +CREATE INDEX ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex); + +CREATE TABLE IF NOT EXISTS shadow ( + shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), + shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1), + sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), + doi TEXT CHECK (octet_length(doi) >= 1), + pmid TEXT CHECK (octet_length(pmid) >= 1), + isbn13 TEXT CHECK (octet_length(isbn13) >= 1), + PRIMARY KEY(shadow_corpus, shadow_id) +); +CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex); diff --git a/sql/sandcrawler_schema.sql b/sql/sandcrawler_schema.sql deleted file mode 100644 index b6bc201..0000000 --- a/sql/sandcrawler_schema.sql +++ /dev/null @@ -1,70 +0,0 @@ - -CREATE TABLE IF NOT EXISTS cdx ( - url TEXT NOT NULL CHECK (octet_length(url) >= 1), - datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14), - sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), - cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40), - mimetype TEXT CHECK (octet_length(mimetype) >= 1), - warc_path TEXT CHECK (octet_length(warc_path) >= 1), - warc_csize BIGINT, - warc_offset BIGINT, - row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, - PRIMARY KEY(url, datetime) -); -CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex); -CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created); - -CREATE TABLE IF NOT EXISTS file_meta ( - sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), - sha256hex TEXT CHECK (octet_length(sha256hex) = 64), - md5hex TEXT CHECK (octet_length(md5hex) = 32), - size_bytes BIGINT, - mimetype TEXT CHECK (octet_length(mimetype) >= 1) -); - -CREATE TABLE IF NOT EXISTS fatcat_file ( - sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), - file_ident TEXT CHECK (octet_length(file_ident) = 26), - first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26) -); - -CREATE TABLE IF NOT EXISTS petabox ( - item TEXT NOT NULL CHECK (octet_length(item) >= 1), - path TEXT NOT NULL CHECK (octet_length(path) >= 1), - sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), - PRIMARY KEY(item, path) -); -CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex); - -CREATE TABLE IF NOT EXISTS grobid ( - sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), - updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, - grobid_version TEXT CHECK (octet_length(grobid_version) >= 1), - status_code INT NOT NULL, - status TEXT CHECK (octet_length(status) >= 1), - fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26), - -- extracted basic biblio metadata: - -- title - -- authors[] - -- full/display - -- given_name - -- surname - -- affiliation - -- year - -- journal_issn - -- journal_name - -- refs_count - metadata JSONB -); --- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release); - -CREATE TABLE IF NOT EXISTS shadow ( - shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), - shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1), - sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), - doi TEXT CHECK (octet_length(doi) >= 1), - pmid TEXT CHECK (octet_length(pmid) >= 1), - isbn13 TEXT CHECK (octet_length(isbn13) >= 1), - PRIMARY KEY(shadow_corpus, shadow_id) -); -CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex); diff --git a/sql/sandcrawler_schema.sql b/sql/sandcrawler_schema.sql new file mode 120000 index 0000000..a3756d4 --- /dev/null +++ b/sql/sandcrawler_schema.sql @@ -0,0 +1 @@ +migrations/2019-12-19-060141_init/up.sql \ No newline at end of file -- cgit v1.2.3