aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--proposals/2020_pdf_meta_thumbnails.md10
-rw-r--r--sql/migrations/2019-12-19-060141_init/up.sql26
2 files changed, 31 insertions, 5 deletions
diff --git a/proposals/2020_pdf_meta_thumbnails.md b/proposals/2020_pdf_meta_thumbnails.md
index eacbfa5..793d6b5 100644
--- a/proposals/2020_pdf_meta_thumbnails.md
+++ b/proposals/2020_pdf_meta_thumbnails.md
@@ -45,15 +45,15 @@ Kafka, and we don't want SQL table size to explode. Schema:
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
status TEXT CHECK (octet_length(status) >= 1) NOT NULL,
- page0_thumbnail BOOLEAN NOT NULL,
+ has_page0_thumbnail BOOLEAN NOT NULL,
page_count INT CHECK (page_count >= 0),
word_count INT CHECK (word_count >= 0),
- page0_height FLOAT CHECK (page0_height >= 0),
- page0_width FLOAT CHECK (page0_width >= 0),
+ page0_height REAL CHECK (page0_height >= 0),
+ page0_width REAL CHECK (page0_width >= 0),
permanent_id TEXT CHECK (octet_length(permanent_id) >= 1),
- creation date TIMESTAMP WITH TIME ZONE,
+ pdf_created TIMESTAMP WITH TIME ZONE,
pdf_version TEXT CHECK (octet_length(pdf_version) >= 1),
- metadata JSONB;
+ metadata JSONB
-- maybe some analysis of available fields?
-- metadata JSON fields:
-- title
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 688487f..59423dd 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -88,6 +88,32 @@ CREATE TABLE IF NOT EXISTS pdftrio (
image_score REAL
);
+CREATE TABLE IF NOT EXISTS pdf_meta (  -- one row per sha1hex (primary key); no-op if the table already exists
+ sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),  -- must be exactly 40 bytes, the length of a hex-encoded SHA-1 digest
+ updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,  -- defaults to insert time when not supplied
+ status TEXT CHECK (octet_length(status) >= 1) NOT NULL,  -- required and non-empty; allowed values are not constrained here
+ has_page0_thumbnail BOOLEAN NOT NULL,  -- required flag; presumably whether a first-page thumbnail exists -- TODO confirm
+ page_count INT CHECK (page_count >= 0),  -- nullable; non-negative when present
+ word_count INT CHECK (word_count >= 0),  -- nullable; non-negative when present
+ page0_height REAL CHECK (page0_height >= 0),  -- nullable; non-negative; units are not specified in this schema
+ page0_width REAL CHECK (page0_width >= 0),  -- nullable; non-negative; units are not specified in this schema
+ permanent_id TEXT CHECK (octet_length(permanent_id) >= 1),  -- nullable; non-empty when present
+ pdf_created TIMESTAMP WITH TIME ZONE,  -- nullable; presumably the creation date recorded in the PDF -- TODO confirm
+ pdf_version TEXT CHECK (octet_length(pdf_version) >= 1),  -- nullable; non-empty when present
+ metadata JSONB  -- nullable, schemaless JSON; the names listed below are not enforced by any constraint
+ -- maybe some analysis of available fields?
+ -- metadata JSON fields:
+ -- title
+ -- subject
+ -- author
+ -- creator
+ -- producer
+ -- CrossMarkDomains
+ -- doi
+ -- form
+ -- encrypted
+);
+
CREATE TABLE IF NOT EXISTS ingest_request (
link_source TEXT NOT NULL CHECK (octet_length(link_source) >= 1),
link_source_id TEXT NOT NULL CHECK (octet_length(link_source_id) >= 1),