aboutsummaryrefslogtreecommitdiffstats
path: root/sql/migrations/2019-12-19-060141_init
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-01-15 13:54:02 -0800
committerBryan Newbold <bnewbold@archive.org>2020-01-15 13:54:02 -0800
commitd06fd45e3c86cb080ad7724f3fc7575750a9cd69 (patch)
tree12862548aa14870af3c710076a6df8441f2ddb4e /sql/migrations/2019-12-19-060141_init
parent4d0224f3e73315ef4db39643e6d4851e4a466658 (diff)
downloadsandcrawler-d06fd45e3c86cb080ad7724f3fc7575750a9cd69.tar.gz
sandcrawler-d06fd45e3c86cb080ad7724f3fc7575750a9cd69.zip
clarify ingest result schema and semantics
Diffstat (limited to 'sql/migrations/2019-12-19-060141_init')
-rw-r--r--sql/migrations/2019-12-19-060141_init/up.sql16
1 files changed, 16 insertions, 0 deletions
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 12ed409..0b2b19c 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -1,10 +1,24 @@
+-- rows *may* be revisit records; indicated by mimetype == "warc/revisit"
+-- records are implied to be 200 status (or 226 for ftp); either direct hits or
+-- revisits
+-- there is nothing to prevent duplicate hits. eg, same sha1, same url, many
+-- datetimes. import scripts should take efforts to reduce this sort of
+-- duplication though. one row per *domain*/sha1hex pair is a good guideline.
+-- all ingest result url/dt pairs should be included though.
+-- any mimetype is allowed, but presumption should be that actual body is full
+-- manifestation of a work. AKA, no landing pages, no webcapture HTML (each
+-- only a part of work). URLs that are parts of a fileset are allowed.
CREATE TABLE IF NOT EXISTS cdx (
url TEXT NOT NULL CHECK (octet_length(url) >= 1),
datetime TEXT NOT NULL CHECK (octet_length(datetime) = 14),
+ -- sha1hex/cdx_sha1hex difference is intended to help with difference between
+ -- CDX hash (which is transport encoded body) vs. actual body. Probably need to
+ -- include both for all records?
sha1hex TEXT NOT NULL CHECK (octet_length(sha1hex) = 40),
cdx_sha1hex TEXT CHECK (octet_length(cdx_sha1hex) = 40),
mimetype TEXT CHECK (octet_length(mimetype) >= 1),
+ -- TODO: enforce that only paths with '/' (item+file) should be included?
warc_path TEXT CHECK (octet_length(warc_path) >= 1),
warc_csize BIGINT,
warc_offset BIGINT,
@@ -12,8 +26,10 @@ CREATE TABLE IF NOT EXISTS cdx (
PRIMARY KEY(url, datetime)
);
CREATE INDEX IF NOT EXISTS cdx_sha1hex_idx ON cdx(sha1hex);
+-- TODO: remove this index? not currently used
CREATE INDEX IF NOT EXISTS cdx_row_created_idx ON cdx(row_created);
+-- TODO: require all fields. if mimetype unknown, should be octet-stream
CREATE TABLE IF NOT EXISTS file_meta (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
sha256hex TEXT CHECK (octet_length(sha256hex) = 64),