From 480dc3fa20102ba0a15013954e76d3b1f826026c Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 28 Jan 2020 19:06:25 -0800 Subject: grobid persist: if status_code is not set, default to 0 We have to set something currently because of a NOT NULL constraint on the table. Originally I thought we would just not record rows if there was an error, and that is still sort of a valid stance. However, when doing bulk GROBID-ing from cdx table, there exist some "bad" CDX rows which cause wayback or petabox errors. We should fix bugs or delete these rows as a cleanup, but until that happens we should record the error state so we don't loop forever. One danger of this commit is that we can clobber existing good rows with new errors rapidly if there is wayback downtime or something like that. --- sql/migrations/2019-12-19-060141_init/up.sql | 1 + 1 file changed, 1 insertion(+) (limited to 'sql') diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql index 0b2b19c..6ba06f1 100644 --- a/sql/migrations/2019-12-19-060141_init/up.sql +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -56,6 +56,7 @@ CREATE TABLE IF NOT EXISTS grobid ( sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, grobid_version TEXT CHECK (octet_length(grobid_version) >= 1), + -- TODO: status_code is validly null if there was a wayback or petabox error. We want to record these cases so we don't loop re-processing forever status_code INT NOT NULL, status TEXT CHECK (octet_length(status) >= 1), fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26), -- cgit v1.2.3