diff options
Diffstat (limited to 'sql/migrations/2019-12-19-060141_init/up.sql')
-rw-r--r-- | sql/migrations/2019-12-19-060141_init/up.sql | 72 |
1 files changed, 70 insertions, 2 deletions
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql index 73bd7f1..33dba66 100644 --- a/sql/migrations/2019-12-19-060141_init/up.sql +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -42,7 +42,9 @@ CREATE INDEX file_meta_md5hex_idx ON file_meta(md5hex); CREATE TABLE IF NOT EXISTS fatcat_file ( sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), file_ident TEXT CHECK (octet_length(file_ident) = 26), - first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26) + first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26), + any_url BOOLEAN, + content_scope TEXT CHECK (octet_length(content_scope) >= 1) ); CREATE TABLE IF NOT EXISTS petabox ( @@ -147,6 +149,7 @@ CREATE TABLE IF NOT EXISTS ingest_request ( PRIMARY KEY (link_source, link_source_id, ingest_type, base_url) ); CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type); +CREATE INDEX ingest_request_source_created_idx ON ingest_request(ingest_request_source, created); CREATE TABLE IF NOT EXISTS ingest_file_result ( ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), @@ -154,7 +157,7 @@ CREATE TABLE IF NOT EXISTS ingest_file_result ( updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, hit BOOLEAN NOT NULL, - status TEXT CHECK (octet_length(terminal_url) >= 1), + status TEXT CHECK (octet_length(status) >= 1), terminal_url TEXT CHECK (octet_length(terminal_url) >= 1), terminal_dt TEXT CHECK (octet_length(terminal_dt) = 14), terminal_status_code INT, @@ -165,6 +168,43 @@ CREATE TABLE IF NOT EXISTS ingest_file_result ( CREATE INDEX ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url); CREATE INDEX ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex); +CREATE TABLE IF NOT EXISTS ingest_fileset_platform ( + ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), + base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + hit BOOLEAN NOT NULL, + status TEXT CHECK (octet_length(status) >= 1), + + platform_name TEXT NOT NULL CHECK (octet_length(platform_name) >= 1), + platform_domain TEXT NOT NULL CHECK (octet_length(platform_domain) >= 1), + platform_id TEXT NOT NULL CHECK (octet_length(platform_id) >= 1), + ingest_strategy TEXT CHECK (octet_length(ingest_strategy) >= 1), + total_size BIGINT, + file_count BIGINT, + archiveorg_item_name TEXT CHECK (octet_length(archiveorg_item_name) >= 1), + + archiveorg_item_bundle_path TEXT CHECK (octet_length(archiveorg_item_bundle_path) >= 1), + web_bundle_url TEXT CHECK (octet_length(web_bundle_url) >= 1), + web_bundle_dt TEXT CHECK (octet_length(web_bundle_dt) = 14), + + manifest JSONB, + -- list, similar to fatcat fileset manifest, plus extra: + -- status (str) + -- path (str) + -- size (int) + -- md5 (str) + -- sha1 (str) + -- sha256 (str) + -- mimetype (str) + -- extra (dict) + -- platform_url (str) + -- terminal_url (str) + -- terminal_dt (str) + + PRIMARY KEY (ingest_type, base_url) +); +CREATE INDEX ingest_fileset_platform_name_domain_id_idx ON ingest_fileset_platform(platform_name, platform_domain, platform_id); + CREATE TABLE IF NOT EXISTS shadow ( shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1), @@ -175,3 +215,31 @@ CREATE TABLE IF NOT EXISTS shadow ( PRIMARY KEY(shadow_corpus, shadow_id) ); CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex); + +CREATE TABLE IF NOT EXISTS crossref ( + doi TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)), + indexed TIMESTAMP WITH TIME ZONE NOT NULL, + record JSON NOT NULL, + PRIMARY KEY(doi) +); + +CREATE TABLE IF NOT EXISTS grobid_refs ( + source TEXT NOT NULL CHECK (octet_length(source) >= 1), + source_id TEXT NOT NULL CHECK (octet_length(source_id) >= 1), + source_ts TIMESTAMP WITH TIME ZONE, + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + refs_json JSON NOT NULL, + PRIMARY KEY(source, source_id) +); + +CREATE OR REPLACE VIEW crossref_with_refs (doi, indexed, record, source_ts, refs_json) AS + SELECT + crossref.doi as doi, + crossref.indexed as indexed, + crossref.record as record, + grobid_refs.source_ts as source_ts, + grobid_refs.refs_json as refs_json + FROM crossref + LEFT JOIN grobid_refs ON + grobid_refs.source_id = crossref.doi + AND grobid_refs.source = 'crossref'; |