diff options
Diffstat (limited to 'sql')
| -rw-r--r-- | sql/migrations/00000000000000_diesel_initial_setup/down.sql | 6 | ||||
| -rw-r--r-- | sql/migrations/00000000000000_diesel_initial_setup/up.sql | 36 | ||||
| -rw-r--r-- | sql/migrations/2019-12-19-060141_init/down.sql | 8 | ||||
| -rw-r--r-- | sql/migrations/2019-12-19-060141_init/up.sql | 106 | ||||
| l---------[-rw-r--r--] | sql/sandcrawler_schema.sql | 71 | 
5 files changed, 157 insertions, 70 deletions
| diff --git a/sql/migrations/00000000000000_diesel_initial_setup/down.sql b/sql/migrations/00000000000000_diesel_initial_setup/down.sql new file mode 100644 index 0000000..a9f5260 --- /dev/null +++ b/sql/migrations/00000000000000_diesel_initial_setup/down.sql @@ -0,0 +1,6 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. + +DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass); +DROP FUNCTION IF EXISTS diesel_set_updated_at(); diff --git a/sql/migrations/00000000000000_diesel_initial_setup/up.sql b/sql/migrations/00000000000000_diesel_initial_setup/up.sql new file mode 100644 index 0000000..d68895b --- /dev/null +++ b/sql/migrations/00000000000000_diesel_initial_setup/up.sql @@ -0,0 +1,36 @@ +-- This file was automatically created by Diesel to setup helper functions +-- and other internal bookkeeping. This file is safe to edit, any future +-- changes will be added to existing projects as new migrations. + + + + +-- Sets up a trigger for the given table to automatically set a column called +-- `updated_at` whenever the row is modified (unless `updated_at` was included +-- in the modified columns) +-- +-- # Example +-- +-- ```sql +-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW()); +-- +-- SELECT diesel_manage_updated_at('users'); +-- ``` +CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$ +BEGIN +    EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s +                    FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl); +END; +$$ LANGUAGE plpgsql; + +CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$ +BEGIN +    IF ( +        NEW IS DISTINCT FROM OLD AND +        NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at +    ) THEN +        NEW.updated_at := current_timestamp; +    END IF; +    RETURN NEW; +END; +$$ LANGUAGE plpgsql; diff --git a/sql/migrations/2019-12-19-060141_init/down.sql b/sql/migrations/2019-12-19-060141_init/down.sql new file mode 100644 index 0000000..a085480 --- /dev/null +++ b/sql/migrations/2019-12-19-060141_init/down.sql @@ -0,0 +1,8 @@ + +DROP TABLE IF NOT EXISTS cdx; +DROP TABLE IF NOT EXISTS file_meta; +DROP TABLE IF NOT EXISTS fatcat_file; +DROP TABLE IF NOT EXISTS petabox; +DROP TABLE IF NOT EXISTS grobid; +DROP TABLE IF NOT EXISTS ingest_request; +DROP TABLE IF NOT EXISTS shadow; diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql new file mode 100644 index 0000000..12ed409 --- /dev/null +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -0,0 +1,106 @@ + +CREATE TABLE IF NOT EXISTS cdx ( +    url                 TEXT NOT NULL CHECK (octet_length(url) >= 1), +    datetime            TEXT NOT NULL CHECK (octet_length(datetime) = 14), +    sha1hex             TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), +    cdx_sha1hex         TEXT CHECK (octet_length(cdx_sha1hex) = 40), +    mimetype            TEXT CHECK (octet_length(mimetype) >= 1), +    warc_path           TEXT CHECK (octet_length(warc_path) >= 1), +    warc_csize          BIGINT, +    warc_offset         BIGINT, +    row_created         TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, +    PRIMARY KEY(url, datetime) +); +CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex); +CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created); + +CREATE TABLE IF NOT EXISTS file_meta ( +    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), +    sha256hex           TEXT CHECK (octet_length(sha256hex) = 64), +    md5hex              TEXT CHECK (octet_length(md5hex) = 32), +    size_bytes          BIGINT, +    mimetype            TEXT CHECK (octet_length(mimetype) >= 1) +); + +CREATE TABLE IF NOT EXISTS fatcat_file ( +    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), +    file_ident          TEXT CHECK (octet_length(file_ident) = 26), +    first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26) +); + +CREATE TABLE IF NOT EXISTS petabox ( +    item                TEXT NOT NULL CHECK (octet_length(item) >= 1), +    path                TEXT NOT NULL CHECK (octet_length(path) >= 1), +    sha1hex             TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), +    PRIMARY KEY(item, path) +); +CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex); + +CREATE TABLE IF NOT EXISTS grobid ( +    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), +    updated             TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, +    grobid_version      TEXT CHECK (octet_length(grobid_version) >= 1), +    status_code         INT NOT NULL, +    status              TEXT CHECK (octet_length(status) >= 1), +    fatcat_release      TEXT CHECK (octet_length(fatcat_release) = 26), +    -- extracted basic biblio metadata: +    --  title +    --  authors[] +    --    full/display +    --    given_name +    --    surname +    --    affiliation +    --  year +    --  journal_issn +    --  journal_name +    --  refs_count +    metadata            JSONB +); +-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release); + +CREATE TABLE IF NOT EXISTS ingest_request ( +    link_source             TEXT NOT NULL CHECK (octet_length(link_source) >= 1), +    link_source_id          TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1), +    ingest_type             TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), +    base_url                TEXT NOT NULL CHECK (octet_length(base_url) >= 1), + +    ingest_request_source   TEXT CHECK (octet_length(ingest_request_source) >= 1), +    created                 TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, +    release_stage           TEXT CHECK (octet_length(release_stage) >= 1), +    request                 JSONB, +    -- request isn't required, but can stash extra fields there for import, eg: +    --   ext_ids (source/source_id sometimes enough) +    --   fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN) +    --   edit_extra + +    PRIMARY KEY (link_source, link_source_id, ingest_type, base_url) +); +CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type); + +CREATE TABLE IF NOT EXISTS ingest_file_result ( +    ingest_type             TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), +    base_url                TEXT NOT NULL CHECK (octet_length(base_url) >= 1), + +    updated                 TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, +    hit                     BOOLEAN NOT NULL, +    status                  TEXT CHECK (octet_length(terminal_url) >= 1), +    terminal_url            TEXT CHECK (octet_length(terminal_url) >= 1), +    terminal_dt             TEXT CHECK (octet_length(terminal_dt) = 14), +    terminal_status_code    INT, +    terminal_sha1hex        TEXT CHECK (octet_length(terminal_sha1hex) = 40), + +    PRIMARY KEY (ingest_type, base_url) +); +CREATE INDEX ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url); +CREATE INDEX ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex); + +CREATE TABLE IF NOT EXISTS shadow ( +    shadow_corpus       TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), +    shadow_id           TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1), +    sha1hex             TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), +    doi                 TEXT CHECK (octet_length(doi) >= 1), +    pmid                TEXT CHECK (octet_length(pmid) >= 1), +    isbn13              TEXT CHECK (octet_length(isbn13) >= 1), +    PRIMARY KEY(shadow_corpus, shadow_id) +); +CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex); diff --git a/sql/sandcrawler_schema.sql b/sql/sandcrawler_schema.sql index b6bc201..a3756d4 100644..120000 --- a/sql/sandcrawler_schema.sql +++ b/sql/sandcrawler_schema.sql @@ -1,70 +1 @@ - -CREATE TABLE IF NOT EXISTS cdx ( -    url                 TEXT NOT NULL CHECK (octet_length(url) >= 1), -    datetime            TEXT NOT NULL CHECK (octet_length(datetime) = 14), -    sha1hex             TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), -    cdx_sha1hex         TEXT CHECK (octet_length(cdx_sha1hex) = 40), -    mimetype            TEXT CHECK (octet_length(mimetype) >= 1), -    warc_path           TEXT CHECK (octet_length(warc_path) >= 1), -    warc_csize          BIGINT, -    warc_offset         BIGINT, -    row_created         TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, -    PRIMARY KEY(url, datetime) -); -CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex); -CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created); - -CREATE TABLE IF NOT EXISTS file_meta ( -    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), -    sha256hex           TEXT CHECK (octet_length(sha256hex) = 64), -    md5hex              TEXT CHECK (octet_length(md5hex) = 32), -    size_bytes          BIGINT, -    mimetype            TEXT CHECK (octet_length(mimetype) >= 1) -); - -CREATE TABLE IF NOT EXISTS fatcat_file ( -    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), -    file_ident          TEXT CHECK (octet_length(file_ident) = 26), -    first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26) -); - -CREATE TABLE IF NOT EXISTS petabox ( -    item                TEXT NOT NULL CHECK (octet_length(item) >= 1), -    path                TEXT NOT NULL CHECK (octet_length(path) >= 1), -    sha1hex             TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), -    PRIMARY KEY(item, path) -); -CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex); - -CREATE TABLE IF NOT EXISTS grobid ( -    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), -    updated             TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, -    grobid_version      TEXT CHECK (octet_length(grobid_version) >= 1), -    status_code         INT NOT NULL, -    status              TEXT CHECK (octet_length(status) >= 1), -    fatcat_release      TEXT CHECK (octet_length(fatcat_release) = 26), -    -- extracted basic biblio metadata: -    --  title -    --  authors[] -    --    full/display -    --    given_name -    --    surname -    --    affiliation -    --  year -    --  journal_issn -    --  journal_name -    --  refs_count -    metadata            JSONB -); --- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release); - -CREATE TABLE IF NOT EXISTS shadow ( -    shadow_corpus       TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), -    shadow_id           TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1), -    sha1hex             TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), -    doi                 TEXT CHECK (octet_length(doi) >= 1), -    pmid                TEXT CHECK (octet_length(pmid) >= 1), -    isbn13              TEXT CHECK (octet_length(isbn13) >= 1), -    PRIMARY KEY(shadow_corpus, shadow_id) -); -CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex); +migrations/2019-12-19-060141_init/up.sql
\ No newline at end of file | 
