aboutsummaryrefslogtreecommitdiffstats
path: root/sql
diff options
context:
space:
mode:
Diffstat (limited to 'sql')
-rw-r--r--sql/migrations/00000000000000_diesel_initial_setup/down.sql6
-rw-r--r--sql/migrations/00000000000000_diesel_initial_setup/up.sql36
-rw-r--r--sql/migrations/2019-12-19-060141_init/down.sql8
-rw-r--r--sql/migrations/2019-12-19-060141_init/up.sql106
l---------[-rw-r--r--]sql/sandcrawler_schema.sql71
5 files changed, 157 insertions, 70 deletions
diff --git a/sql/migrations/00000000000000_diesel_initial_setup/down.sql b/sql/migrations/00000000000000_diesel_initial_setup/down.sql
new file mode 100644
index 0000000..a9f5260
--- /dev/null
+++ b/sql/migrations/00000000000000_diesel_initial_setup/down.sql
@@ -0,0 +1,6 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+DROP FUNCTION IF EXISTS diesel_manage_updated_at(_tbl regclass);
+DROP FUNCTION IF EXISTS diesel_set_updated_at();
diff --git a/sql/migrations/00000000000000_diesel_initial_setup/up.sql b/sql/migrations/00000000000000_diesel_initial_setup/up.sql
new file mode 100644
index 0000000..d68895b
--- /dev/null
+++ b/sql/migrations/00000000000000_diesel_initial_setup/up.sql
@@ -0,0 +1,36 @@
+-- This file was automatically created by Diesel to setup helper functions
+-- and other internal bookkeeping. This file is safe to edit, any future
+-- changes will be added to existing projects as new migrations.
+
+
+
+
+-- Sets up a trigger for the given table to automatically set a column called
+-- `updated_at` whenever the row is modified (unless `updated_at` was included
+-- in the modified columns)
+--
+-- # Example
+--
+-- ```sql
+-- CREATE TABLE users (id SERIAL PRIMARY KEY, updated_at TIMESTAMP NOT NULL DEFAULT NOW());
+--
+-- SELECT diesel_manage_updated_at('users');
+-- ```
+CREATE OR REPLACE FUNCTION diesel_manage_updated_at(_tbl regclass) RETURNS VOID AS $$
+BEGIN
+ EXECUTE format('CREATE TRIGGER set_updated_at BEFORE UPDATE ON %s
+ FOR EACH ROW EXECUTE PROCEDURE diesel_set_updated_at()', _tbl);
+END;
+$$ LANGUAGE plpgsql;
+
+CREATE OR REPLACE FUNCTION diesel_set_updated_at() RETURNS trigger AS $$
+BEGIN
+ IF (
+ NEW IS DISTINCT FROM OLD AND
+ NEW.updated_at IS NOT DISTINCT FROM OLD.updated_at
+ ) THEN
+ NEW.updated_at := current_timestamp;
+ END IF;
+ RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
diff --git a/sql/migrations/2019-12-19-060141_init/down.sql b/sql/migrations/2019-12-19-060141_init/down.sql
new file mode 100644
index 0000000..a085480
--- /dev/null
+++ b/sql/migrations/2019-12-19-060141_init/down.sql
@@ -0,0 +1,8 @@
+
+DROP TABLE IF NOT EXISTS cdx;
+DROP TABLE IF NOT EXISTS file_meta;
+DROP TABLE IF NOT EXISTS fatcat_file;
+DROP TABLE IF NOT EXISTS petabox;
+DROP TABLE IF NOT EXISTS grobid;
+DROP TABLE IF NOT EXISTS ingest_request;
+DROP TABLE IF NOT EXISTS shadow;
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
new file mode 100644
index 0000000..12ed409
--- /dev/null
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -0,0 +1,106 @@
+
+CREATE TABLE IF NOT EXISTS cdx (
+ url TEXT NOT NULL CHECK (octet_length(url) >= 1),
+ datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14),
+ sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+ cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40),
+ mimetype TEXT CHECK (octet_length(mimetype) >= 1),
+ warc_path TEXT CHECK (octet_length(warc_path) >= 1),
+ warc_csize BIGINT,
+ warc_offset BIGINT,
+ row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ PRIMARY KEY(url, datetime)
+);
+CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex);
+CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created);
+
+CREATE TABLE IF NOT EXISTS file_meta (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ sha256hex TEXT CHECK (octet_length(sha256hex) = 64),
+ md5hex TEXT CHECK (octet_length(md5hex) = 32),
+ size_bytes BIGINT,
+ mimetype TEXT CHECK (octet_length(mimetype) >= 1)
+);
+
+CREATE TABLE IF NOT EXISTS fatcat_file (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ file_ident TEXT CHECK (octet_length(file_ident) = 26),
+ first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26)
+);
+
+CREATE TABLE IF NOT EXISTS petabox (
+ item TEXT NOT NULL CHECK (octet_length(item) >= 1),
+ path TEXT NOT NULL CHECK (octet_length(path) >= 1),
+ sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+ PRIMARY KEY(item, path)
+);
+CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex);
+
+CREATE TABLE IF NOT EXISTS grobid (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ grobid_version TEXT CHECK (octet_length(grobid_version) >= 1),
+ status_code INT NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1),
+ fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26),
+ -- extracted basic biblio metadata:
+ -- title
+ -- authors[]
+ -- full/display
+ -- given_name
+ -- surname
+ -- affiliation
+ -- year
+ -- journal_issn
+ -- journal_name
+ -- refs_count
+ metadata JSONB
+);
+-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release);
+
+CREATE TABLE IF NOT EXISTS ingest_request (
+ link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
+ link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),
+ ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+ base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+
+ ingest_request_source TEXT CHECK (octet_length(ingest_request_source) >= 1),
+ created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ release_stage TEXT CHECK (octet_length(release_stage) >= 1),
+ request JSONB,
+ -- request isn't required, but can stash extra fields there for import, eg:
+ -- ext_ids (source/source_id sometimes enough)
+ -- fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN)
+ -- edit_extra
+
+ PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)
+);
+CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type);
+
+CREATE TABLE IF NOT EXISTS ingest_file_result (
+ ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+ base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ hit BOOLEAN NOT NULL,
+ status TEXT CHECK (octet_length(terminal_url) >= 1),
+ terminal_url TEXT CHECK (octet_length(terminal_url) >= 1),
+ terminal_dt TEXT CHECK (octet_length(terminal_dt) = 14),
+ terminal_status_code INT,
+ terminal_sha1hex TEXT CHECK (octet_length(terminal_sha1hex) = 40),
+
+ PRIMARY KEY (ingest_type, base_url)
+);
+CREATE INDEX ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url);
+CREATE INDEX ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex);
+
+CREATE TABLE IF NOT EXISTS shadow (
+ shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
+ shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1),
+ sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+ doi TEXT CHECK (octet_length(doi) >= 1),
+ pmid TEXT CHECK (octet_length(pmid) >= 1),
+ isbn13 TEXT CHECK (octet_length(isbn13) >= 1),
+ PRIMARY KEY(shadow_corpus, shadow_id)
+);
+CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex);
diff --git a/sql/sandcrawler_schema.sql b/sql/sandcrawler_schema.sql
index b6bc201..a3756d4 100644..120000
--- a/sql/sandcrawler_schema.sql
+++ b/sql/sandcrawler_schema.sql
@@ -1,70 +1 @@
-
-CREATE TABLE IF NOT EXISTS cdx (
- url TEXT NOT NULL CHECK (octet_length(url) >= 1),
- datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14),
- sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
- cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40),
- mimetype TEXT CHECK (octet_length(mimetype) >= 1),
- warc_path TEXT CHECK (octet_length(warc_path) >= 1),
- warc_csize BIGINT,
- warc_offset BIGINT,
- row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
- PRIMARY KEY(url, datetime)
-);
-CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex);
-CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created);
-
-CREATE TABLE IF NOT EXISTS file_meta (
- sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
- sha256hex TEXT CHECK (octet_length(sha256hex) = 64),
- md5hex TEXT CHECK (octet_length(md5hex) = 32),
- size_bytes BIGINT,
- mimetype TEXT CHECK (octet_length(mimetype) >= 1)
-);
-
-CREATE TABLE IF NOT EXISTS fatcat_file (
- sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
- file_ident TEXT CHECK (octet_length(file_ident) = 26),
- first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26)
-);
-
-CREATE TABLE IF NOT EXISTS petabox (
- item TEXT NOT NULL CHECK (octet_length(item) >= 1),
- path TEXT NOT NULL CHECK (octet_length(path) >= 1),
- sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
- PRIMARY KEY(item, path)
-);
-CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex);
-
-CREATE TABLE IF NOT EXISTS grobid (
- sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
- updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
- grobid_version TEXT CHECK (octet_length(grobid_version) >= 1),
- status_code INT NOT NULL,
- status TEXT CHECK (octet_length(status) >= 1),
- fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26),
- -- extracted basic biblio metadata:
- -- title
- -- authors[]
- -- full/display
- -- given_name
- -- surname
- -- affiliation
- -- year
- -- journal_issn
- -- journal_name
- -- refs_count
- metadata JSONB
-);
--- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release);
-
-CREATE TABLE IF NOT EXISTS shadow (
- shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
- shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1),
- sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
- doi TEXT CHECK (octet_length(doi) >= 1),
- pmid TEXT CHECK (octet_length(pmid) >= 1),
- isbn13 TEXT CHECK (octet_length(isbn13) >= 1),
- PRIMARY KEY(shadow_corpus, shadow_id)
-);
-CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex);
+migrations/2019-12-19-060141_init/up.sql \ No newline at end of file