diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-08-01 18:14:30 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2019-08-01 18:14:30 -0700 |
commit | 2a1e99c7fd000d4f49dff06361c5214f08f0d03f (patch) | |
tree | 5c458d414df95963d6a1eeed72e82d9319a09533 /postgresql/sandcrawler_schema.sql | |
parent | f32f2c6dcae0ea0fab6927264d10f73f942548ca (diff) | |
download | sandcrawler-2a1e99c7fd000d4f49dff06361c5214f08f0d03f.tar.gz sandcrawler-2a1e99c7fd000d4f49dff06361c5214f08f0d03f.zip |
start of postgres/postgrest notes and schema
Diffstat (limited to 'postgresql/sandcrawler_schema.sql')
-rw-r--r-- | postgresql/sandcrawler_schema.sql | 57 |
1 files changed, 57 insertions, 0 deletions
diff --git a/postgresql/sandcrawler_schema.sql b/postgresql/sandcrawler_schema.sql new file mode 100644 index 0000000..9e5651d --- /dev/null +++ b/postgresql/sandcrawler_schema.sql @@ -0,0 +1,57 @@ + +CREATE TABLE cdx ( + id BIGSERIAL PRIMARY KEY, + sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), + cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40), + url TEXT NOT NULL CHECK (octet_length(url) >= 1), + datetime TIMESTAMP WITH TIME ZONE NOT NULL, + mimetype TEXT CHECK (octet_length(mimetype) >= 1), + warc_path TEXT CHECK (octet_length(warc_path) >= 1), + warc_offset BIGINT, + row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL +); +CREATE INDEX cdx_sha1hex_idx ON cdx(sha1hex); +CREATE INDEX cdx_row_created_idx ON cdx(row_created); + +CREATE TABLE file_meta ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + sha256hex TEXT CHECK (octet_length(sha256hex) = 64), + md5hex TEXT CHECK (octet_length(md5hex) = 32), + size_bytes BIGINT, + mimetype TEXT CHECK (octet_length(mimetype) >= 1) +); + +CREATE TABLE fatcat_file ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + file_ident TEXT CHECK (octet_length(file_ident) = 26), + first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26) +); + +CREATE TABLE petabox ( + id BIGSERIAL PRIMARY KEY, + sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), + item TEXT NOT NULL CHECK (octet_length(item) >= 1), + path TEXT NOT NULL CHECK (octet_length(path) >= 1) +); +CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex); + +CREATE TABLE grobid ( + sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), + updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, + grobid_version TEXT NOT NULL CHECK (octet_length(grobid_version) >= 1), + status_code INT NOT NULL, + status TEXT CHECK (octet_length(status) >= 1), + fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26), + metadata JSONB +); +-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release); + +CREATE TABLE shadow ( + sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40), + shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1), + shadow_id TEXT CHECK (octet_length(shadow_id) >= 1), + doi TEXT CHECK (octet_length(doi) >= 1), + pmid TEXT CHECK (octet_length(pmid) >= 1), + isbn13 TEXT CHECK (octet_length(isbn13) >= 1), + PRIMARY KEY(sha1hex, shadow_corpus) +); |