diff options
-rw-r--r-- | python/sandcrawler/db.py | 3 | ||||
-rw-r--r-- | python/sandcrawler/persist.py | 6 | ||||
-rw-r--r-- | sql/migrations/2019-12-19-060141_init/up.sql | 1 |
3 files changed, 3 insertions, 7 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 3ec325e..5662b32 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -161,7 +161,8 @@ class SandcrawlerPostgresClient:
                 r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
         batch = [(d['key'],
                   d.get('grobid_version') or None,
-                  d['status_code'],
+                  # status_code is validly not set if there was, eg, error-wayback in grobid-worker
+                  d.get('status_code') or 0,
                   d['status'],
                   d.get('fatcat_release') or None,
                   d.get('updated') or datetime.datetime.now(),
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 9ef3e93..77a1a82 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -222,12 +222,6 @@ class PersistGrobidWorker(SandcrawlerWorker):
     def push_batch(self, batch):
         self.counts['total'] += len(batch)
 
-        # filter out bad "missing status_code" timeout rows
-        missing = [r for r in batch if not r.get('status_code')]
-        if missing:
-            self.counts['skip-missing-status'] += len(missing)
-            batch = [r for r in batch if r.get('status_code')]
-
         for r in batch:
             if r['status_code'] != 200 or not r.get('tei_xml'):
                 self.counts['s3-skip-status'] += 1
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 0b2b19c..6ba06f1 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -56,6 +56,7 @@ CREATE TABLE IF NOT EXISTS grobid (
     sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
     updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
     grobid_version TEXT CHECK (octet_length(grobid_version) >= 1),
+    -- TODO: status_code is validly null if there was a wayback or petabox error. We want to record these cases so we don't loop re-processing forever
     status_code INT NOT NULL,
     status TEXT CHECK (octet_length(status) >= 1),
     fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26),