diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-17 21:21:19 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-17 21:21:19 -0700 |
commit | cc790c7c568a819a6a73fe788795c333cedbe109 (patch) | |
tree | 645b1ddf880b26e1af380862d7f8e5e37ec36a60 | |
parent | e1806a3233718ab990955d659062c97ca5679302 (diff) | |
download | sandcrawler-cc790c7c568a819a6a73fe788795c333cedbe109.tar.gz sandcrawler-cc790c7c568a819a6a73fe788795c333cedbe109.zip |
tweak pdf_meta SQL schema
-rw-r--r-- | proposals/2020_pdf_meta_thumbnails.md | 10 | ||||
-rw-r--r-- | sql/migrations/2019-12-19-060141_init/up.sql | 26 |
2 files changed, 31 insertions, 5 deletions
diff --git a/proposals/2020_pdf_meta_thumbnails.md b/proposals/2020_pdf_meta_thumbnails.md
index eacbfa5..793d6b5 100644
--- a/proposals/2020_pdf_meta_thumbnails.md
+++ b/proposals/2020_pdf_meta_thumbnails.md
@@ -45,15 +45,15 @@ Kafka, and we don't want SQL table size to explode. Schema:
     sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
     updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
     status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
-    page0_thumbnail BOOLEAN NOT NULL,
+    has_page0_thumbnail BOOLEAN NOT NULL,
     page_count INT CHECK (page_count >= 0),
     word_count INT CHECK (word_count >= 0),
-    page0_height FLOAT CHECK (page0_height >= 0),
-    page0_width FLOAT CHECK (page0_width >= 0),
+    page0_height REAL CHECK (page0_height >= 0),
+    page0_width REAL CHECK (page0_width >= 0),
     permanent_id TEXT CHECK (octet_length(permanent_id) >= 1),
-    creation date TIMESTAMP WITH TIME ZONE,
+    pdf_created TIMESTAMP WITH TIME ZONE,
     pdf_version TEXT CHECK (octet_length(pdf_version) >= 1),
-    metadata JSONB;
+    metadata JSONB
     -- maybe some analysis of available fields?
     -- metadata JSON fields:
     -- title
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 688487f..59423dd 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -88,6 +88,32 @@ CREATE TABLE IF NOT EXISTS pdftrio (
     image_score REAL
 );
 
+CREATE TABLE IF NOT EXISTS pdf_meta (
+    sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
+    updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
+    status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
+    has_page0_thumbnail BOOLEAN NOT NULL,
+    page_count INT CHECK (page_count >= 0),
+    word_count INT CHECK (word_count >= 0),
+    page0_height REAL CHECK (page0_height >= 0),
+    page0_width REAL CHECK (page0_width >= 0),
+    permanent_id TEXT CHECK (octet_length(permanent_id) >= 1),
+    pdf_created TIMESTAMP WITH TIME ZONE,
+    pdf_version TEXT CHECK (octet_length(pdf_version) >= 1),
+    metadata JSONB
+    -- maybe some analysis of available fields?
+    -- metadata JSON fields:
+    -- title
+    -- subject
+    -- author
+    -- creator
+    -- producer
+    -- CrossMarkDomains
+    -- doi
+    -- form
+    -- encrypted
+);
+
 CREATE TABLE IF NOT EXISTS ingest_request (
     link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
     link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),