diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 11:26:16 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-03 11:26:16 -0800 |
commit | 806967ca168bcdbf2e57699703904333c21d4a2f (patch) | |
tree | b63b24666d9da5fda01a5ff6bd824d09d0deb950 /sql | |
parent | 206c4f6ba10417cfb463be9101d71291bd0e458c (diff) | |
download | sandcrawler-806967ca168bcdbf2e57699703904333c21d4a2f.tar.gz sandcrawler-806967ca168bcdbf2e57699703904333c21d4a2f.zip |
html: start on SQL table
Diffstat (limited to 'sql')
-rw-r--r-- | sql/migrations/2019-12-19-060141_init/up.sql | 15 |
1 files changed, 15 insertions, 0 deletions
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 59423dd..6a8c52b 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -114,6 +114,20 @@ CREATE TABLE IF NOT EXISTS pdf_meta (
     -- encrypted
 );
 
+CREATE TABLE IF NOT EXISTS html_meta (
+    sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),  -- exactly 40 octets (length of a hex SHA-1 digest)
+    updated             TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,      -- defaults to insert time
+    status              TEXT CHECK (octet_length(status) >= 1) NOT NULL,      -- required, non-empty
+    has_teixml          BOOLEAN NOT NULL,
+    has_thumbnail       BOOLEAN NOT NULL,
+    word_count          INT CHECK (word_count >= 0),                          -- nullable; non-negative when set
+    resource_count      INT CHECK (resource_count >= 0),                      -- nullable; non-negative when set
+    biblio              JSONB,
+    resources           JSONB  -- last column: no trailing comma, or the statement fails to parse
+    -- biblio JSON fields are similar to fatcat release schema
+    -- resources JSON object is a list of objects with keys like webcapture CDX schema
+);
+
 CREATE TABLE IF NOT EXISTS ingest_request (
     link_source             TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
     link_source_id          TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),
@@ -128,6 +142,7 @@ CREATE TABLE IF NOT EXISTS ingest_request (
     -- ext_ids (source/source_id sometimes enough)
     -- fatcat_release (if ext_ids and source/source_id not specific enough; eg SPN)
     -- edit_extra
+    -- ingest type can be: pdf, xml, html
     PRIMARY KEY (link_source, link_source_id, ingest_type, base_url)
 );
 