diff options
Diffstat (limited to 'sql/migrations/2019-12-19-060141_init/up.sql')
-rw-r--r-- | sql/migrations/2019-12-19-060141_init/up.sql | 184 |
1 files changed, 184 insertions, 0 deletions
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql new file mode 100644 index 0000000..10a5183 --- /dev/null +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -0,0 +1,184 @@ + +-- rows *may* be revisit records; indicated by mimetype == "warc/revisit" +-- records are implied to be 200 status (or 226 for ftp); either direct hits or +-- revisits +-- there is nothing to prevent duplicate hits. eg, same sha1, same url, many +-- datetimes. import scripts should take efforts to reduce this sort of +-- duplication though. one row per *domain*/sha1hex pair is a good guideline. +-- all ingest result url/dt pairs should be included though. +-- any mimetype is allowed, but presumption should be that actual body is full +-- manifestation of a work. AKA, no landing pages, no webcapture HTML (each +-- only a part of work). URLs that are parts of a fileset are allowed. +CREATE TABLE IF NOT EXISTS cdx ( + url TEXT NOT NULL CHECK (octet_length(url) >= 1), + datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14), + -- sha1hex/cdx_sha1hex difference is intended to help with difference between + -- CDX hash (which is transport encoded body) vs. actual body. Probably need to + -- include both for all records? + sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), + cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40), + mimetype TEXT CHECK (octet_length(mimetype) >= 1), + -- TODO: enforce that only paths with '/' (item+file) should be included? + warc_path TEXT CHECK (octet_length(warc_path) >= 1), + warc_csize BIGINT, + warc_offset BIGINT, + row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + PRIMARY KEY(url, datetime) +); +CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex); +-- TODO: remove this index? not currently used +CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created); + +-- TODO: require all fields. if mimetype unknown, should be octet-stream +CREATE TABLE IF NOT EXISTS file_meta ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + sha256hex TEXT CHECK (octet_length(sha256hex) = 64), + md5hex TEXT CHECK (octet_length(md5hex) = 32), + size_bytes BIGINT, + mimetype TEXT CHECK (octet_length(mimetype) >= 1) +); +CREATE INDEX file_meta_md5hex_idx ON file_meta(md5hex); + +CREATE TABLE IF NOT EXISTS fatcat_file ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + file_ident TEXT CHECK (octet_length(file_ident) = 26), + first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26) +); + +CREATE TABLE IF NOT EXISTS petabox ( + item TEXT NOT NULL CHECK (octet_length(item) >= 1), + path TEXT NOT NULL CHECK (octet_length(path) >= 1), + sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), + PRIMARY KEY(item, path) +); +CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex); + +CREATE TABLE IF NOT EXISTS grobid ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + grobid_version TEXT CHECK (octet_length(grobid_version) >= 1), + status_code INT NOT NULL, + status TEXT CHECK (octet_length(status) >= 1), + fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26), + -- extracted basic biblio metadata: + -- title + -- authors[] + -- full/display + -- given_name + -- surname + -- affiliation + -- year + -- journal_issn + -- journal_name + -- refs_count + metadata JSONB +); +-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release); + +CREATE TABLE IF NOT EXISTS pdftrio ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + status_code INT NOT NULL, + status TEXT CHECK (octet_length(status) >= 1) NOT NULL, + pdftrio_version TEXT CHECK (octet_length(pdftrio_version) >= 1), + models_date DATE, + ensemble_score REAL, + bert_score REAL, + linear_score REAL, + image_score REAL +); + +CREATE TABLE IF NOT EXISTS pdf_meta ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + status TEXT CHECK (octet_length(status) >= 1) NOT NULL, + has_page0_thumbnail BOOLEAN NOT NULL, + page_count INT CHECK (page_count >= 0), + word_count INT CHECK (word_count >= 0), + page0_height REAL CHECK (page0_height >= 0), + page0_width REAL CHECK (page0_width >= 0), + permanent_id TEXT CHECK (octet_length(permanent_id) >= 1), + pdf_created TIMESTAMP WITH TIME ZONE, + pdf_version TEXT CHECK (octet_length(pdf_version) >= 1), + metadata JSONB + -- maybe some analysis of available fields? + -- metadata JSON fields: + -- title + -- subject + -- author + -- creator + -- producer + -- CrossMarkDomains + -- doi + -- form + -- encrypted +); + +CREATE TABLE IF NOT EXISTS html_meta ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + status TEXT CHECK (octet_length(status) >= 1) NOT NULL, + scope TEXT CHECK (octet_length(status) >= 1), + has_teixml BOOLEAN NOT NULL, + has_thumbnail BOOLEAN NOT NULL, + word_count INT CHECK (word_count >= 0), + biblio JSONB, + resources JSONB + -- biblio JSON fields are similar to fatcat release schema + -- resources JSON object is a list of objects with keys like webcapture CDX schema +); + +CREATE TABLE IF NOT EXISTS ingest_request ( + link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1), + link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1), + ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), + base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1), + + ingest_request_source TEXT CHECK (octet_length(ingest_request_source) >= 1), + created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + release_stage TEXT CHECK (octet_length(release_stage) >= 1), + request JSONB, + -- request isn't required, but can stash extra fields there for import, eg: + -- ext_ids (source/source_id sometimes enough) + -- fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN) + -- edit_extra + -- ingest type can be: pdf, xml, html + + PRIMARY KEY (link_source, link_source_id, ingest_type, base_url) +); +CREATE INDEX ingest_request_base_url_idx ON ingest_request(base_url, ingest_type); + +CREATE TABLE IF NOT EXISTS ingest_file_result ( + ingest_type TEXT NOT NULL CHECK (octet_length(ingest_type) >= 1), + base_url TEXT NOT NULL CHECK (octet_length(base_url) >= 1), + + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + hit BOOLEAN NOT NULL, + status TEXT CHECK (octet_length(terminal_url) >= 1), + terminal_url TEXT CHECK (octet_length(terminal_url) >= 1), + terminal_dt TEXT CHECK (octet_length(terminal_dt) = 14), + terminal_status_code INT, + terminal_sha1hex TEXT CHECK (octet_length(terminal_sha1hex) = 40), + + PRIMARY KEY (ingest_type, base_url) +); +CREATE INDEX ingest_file_result_terminal_url_idx ON ingest_file_result(terminal_url); +CREATE INDEX ingest_file_result_terminal_sha1hex_idx ON ingest_file_result(terminal_sha1hex); + +CREATE TABLE IF NOT EXISTS shadow ( + shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), + shadow_id TEXT NOT NULL CHECK (octet_length(shadow_id) >= 1), + sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), + doi TEXT CHECK (octet_length(doi) >= 1), + pmid TEXT CHECK (octet_length(pmid) >= 1), + isbn13 TEXT CHECK (octet_length(isbn13) >= 1), + PRIMARY KEY(shadow_corpus, shadow_id) +); +CREATE INDEX shadow_sha1hex_idx ON shadow(sha1hex); + +CREATE TABLE IF NOT EXISTS crossref ( + doi TEXT NOT NULL CHECK (octet_length(doi) >= 4 AND doi = LOWER(doi)), + indexed TIMESTAMP WITH TIME ZONE NOT NULL, + record JSON NOT NULL, + PRIMARY KEY(doi) +); |