aboutsummaryrefslogtreecommitdiffstats
path: root/postgresql/sandcrawler_schema.sql
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2019-08-01 18:14:30 -0700
committerBryan Newbold <bnewbold@archive.org>2019-08-01 18:14:30 -0700
commit2a1e99c7fd000d4f49dff06361c5214f08f0d03f (patch)
tree5c458d414df95963d6a1eeed72e82d9319a09533 /postgresql/sandcrawler_schema.sql
parentf32f2c6dcae0ea0fab6927264d10f73f942548ca (diff)
downloadsandcrawler-2a1e99c7fd000d4f49dff06361c5214f08f0d03f.tar.gz
sandcrawler-2a1e99c7fd000d4f49dff06361c5214f08f0d03f.zip
start of postgres/postgrest notes and schema
Diffstat (limited to 'postgresql/sandcrawler_schema.sql')
-rw-r--r--postgresql/sandcrawler_schema.sql57
1 files changed, 57 insertions, 0 deletions
diff --git a/postgresql/sandcrawler_schema.sql b/postgresql/sandcrawler_schema.sql
new file mode 100644
index 0000000..9e5651d
--- /dev/null
+++ b/postgresql/sandcrawler_schema.sql
@@ -0,0 +1,57 @@
+
+CREATE TABLE cdx (
+ id BIGSERIAL PRIMARY KEY,
+ sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+ cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40),
+ url TEXT NOT NULL CHECK (octet_length(url) >= 1),
+ datetime TIMESTAMP WITH TIME ZONE NOT NULL,
+ mimetype TEXT CHECK (octet_length(mimetype) >= 1),
+ warc_path TEXT CHECK (octet_length(warc_path) >= 1),
+ warc_offset BIGINT,
+ row_created TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL
+);
+CREATE INDEX cdx_sha1hex_idx ON cdx(sha1hex);
+CREATE INDEX cdx_row_created_idx ON cdx(row_created);
+
+CREATE TABLE file_meta (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ sha256hex TEXT CHECK (octet_length(sha256hex) = 64),
+ md5hex TEXT CHECK (octet_length(md5hex) = 32),
+ size_bytes BIGINT,
+ mimetype TEXT CHECK (octet_length(mimetype) >= 1)
+);
+
+CREATE TABLE fatcat_file (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ file_ident TEXT CHECK (octet_length(file_ident) = 26),
+ first_release_ident TEXT CHECK (octet_length(first_release_ident) = 26)
+);
+
+CREATE TABLE petabox (
+ id BIGSERIAL PRIMARY KEY,
+ sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+ item TEXT NOT NULL CHECK (octet_length(item) >= 1),
+ path TEXT NOT NULL CHECK (octet_length(path) >= 1)
+);
+CREATE INDEX petabox_sha1hex_idx ON petabox(sha1hex);
+
+CREATE TABLE grobid (
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+ grobid_version TEXT NOT NULL CHECK (octet_length(grobid_version) >= 1),
+ status_code INT NOT NULL,
+ status TEXT CHECK (octet_length(status) >= 1),
+ fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26),
+ metadata JSONB
+);
+-- CREATE INDEX grobid_fatcat_release_idx ON grobid(fatcat_release);
+
+CREATE TABLE shadow (
+ sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
+ shadow_corpus TEXT NOT NULL CHECK (octet_length(shadow_corpus) >= 1),
+ shadow_id TEXT CHECK (octet_length(shadow_id) >= 1),
+ doi TEXT CHECK (octet_length(doi) >= 1),
+ pmid TEXT CHECK (octet_length(pmid) >= 1),
+ isbn13 TEXT CHECK (octet_length(isbn13) >= 1),
+ PRIMARY KEY(sha1hex, shadow_corpus)
+);