aboutsummaryrefslogtreecommitdiffstats
path: root/sql/migrations/2019-12-19-060141_init/up.sql
diff options
context:
space:
mode:
Diffstat (limited to 'sql/migrations/2019-12-19-060141_init/up.sql')
-rw-r--r--sql/migrations/2019-12-19-060141_init/up.sql72
1 files changed, 70 insertions, 2 deletions
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 73bd7f1..33dba66 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -42,7 +42,9 @@ CREATE INDEX file_meta_md5hex_idx ON file_meta(md5hex);
CREATE TABLE IF NOT EXISTS fatcat_file (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
file_ident TEXT CHECK (octet_length(file_ident) = 26),
- first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26)
+ first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26),
+ any_url BOOLEAN,
+ content_scope TEXT CHECK (octet_length(content_scope) >= 1)
);
CREATE TABLE IF NOT EXISTS petabox (
@@ -147,6 +149,7 @@ CREATE TABLE IF NOT EXISTS ingest_request (
PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)
);
CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type);
+CREATE INDEX ingest_request_source_created_idx ON ingest_request(ingest_request_source, created);
CREATE TABLE IF NOT EXISTS ingest_file_result (
ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
@@ -154,7 +157,7 @@ CREATE TABLE IF NOT EXISTS ingest_file_result (
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
hit BOOLEAN NOT NULL,
- status TEXT CHECK (octet_length(terminal_url) >= 1),
+ status TEXT CHECK (octet_length(status) >= 1),
terminal_url TEXT CHECK (octet_length(terminal_url) >= 1),
terminal_dt TEXT CHECK (octet_length(terminal_dt) = 14),
terminal_status_code INT,
@@ -165,6 +168,43 @@ CREATE TABLE IF NOT EXISTS ingest_file_result (
CREATE INDEX ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url);
CREATE INDEX ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex);
+CREATE TABLE IF NOT EXISTS ingest_fileset_platform (
+ ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1),
+ base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ hit BOOLEAN NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1),
+
+ platform_name TEXT NOT NULL CHECK (octet_length(platform_name) >= 1),
+ platform_domain TEXT NOT NULL CHECK (octet_length(platform_domain) >= 1),
+ platform_id TEXT NOT NULL CHECK (octet_length(platform_id) >= 1),
+ ingest_strategy TEXT CHECK (octet_length(ingest_strategy) >= 1),
+ total_size BIGINT,
+ file_count BIGINT,
+ archiveorg_item_name TEXT CHECK (octet_length(archiveorg_item_name) >= 1),
+
+ archiveorg_item_bundle_path TEXT CHECK (octet_length(archiveorg_item_bundle_path) >= 1),
+ web_bundle_url TEXT CHECK (octet_length(web_bundle_url) >= 1),
+ web_bundle_dt TEXT CHECK (octet_length(web_bundle_dt) = 14),
+
+ manifest JSONB,
+ -- list, similar to fatcat fileset manifest, plus extra:
+ -- status (str)
+ -- path (str)
+ -- size (int)
+ -- md5 (str)
+ -- sha1 (str)
+ -- sha256 (str)
+ -- mimetype (str)
+ -- extra (dict)
+ -- platform_url (str)
+ -- terminal_url (str)
+ -- terminal_dt (str)
+
+ PRIMARY KEY (ingest_type, base_url)
+);
+CREATE INDEX ingest_fileset_platform_name_domain_id_idx ON ingest_fileset_platform(platform_name, platform_domain, platform_id);
+
CREATE TABLE IF NOT EXISTS shadow (
shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1),
@@ -175,3 +215,31 @@ CREATE TABLE IF NOT EXISTS shadow (
PRIMARY KEY(shadow_corpus, shadow_id)
);
CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex);
+
+CREATE TABLE IF NOT EXISTS crossref (
+ doi TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)),
+ indexed TIMESTAMP WITH TIME ZONE NOT NULL,
+ record JSON NOT NULL,
+ PRIMARY KEY(doi)
+);
+
+CREATE TABLE IF NOT EXISTS grobid_refs (
+ source TEXT NOT NULL CHECK (octet_length(source) >= 1),
+ source_id TEXT NOT NULL CHECK (octet_length(source_id) >= 1),
+ source_ts TIMESTAMP WITH TIME ZONE,
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ refs_json JSON NOT NULL,
+ PRIMARY KEY(source, source_id)
+);
+
+CREATE OR REPLACE VIEW crossref_with_refs (doi, indexed, record, source_ts, refs_json) AS
+ SELECT
+ crossref.doi as doi,
+ crossref.indexed as indexed,
+ crossref.record as record,
+ grobid_refs.source_ts as source_ts,
+ grobid_refs.refs_json as refs_json
+ FROM crossref
+ LEFT JOIN grobid_refs ON
+ grobid_refs.source_id = crossref.doi
+ AND grobid_refs.source = 'crossref';