about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- python/sandcrawler/db.py 3
-rw-r--r-- python/sandcrawler/persist.py 6
-rw-r--r-- sql/migrations/2019-12-19-060141_init/up.sql 1
3 files changed, 3 insertions, 7 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 3ec325e..5662b32 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -161,7 +161,8 @@ class SandcrawlerPostgresClient:
r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
batch = [(d['key'],
d.get('grobid_version') or None,
- d['status_code'],
+ # status_code may validly be unset (e.g., on error-wayback in grobid-worker); fall back to 0
+ d.get('status_code') or 0,
d['status'],
d.get('fatcat_release') or None,
d.get('updated') or datetime.datetime.now(),
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 9ef3e93..77a1a82 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -222,12 +222,6 @@ class PersistGrobidWorker(SandcrawlerWorker):
def push_batch(self, batch):
self.counts['total'] += len(batch)
- # filter out bad "missing status_code" timeout rows
- missing = [r for r in batch if not r.get('status_code')]
- if missing:
- self.counts['skip-missing-status'] += len(missing)
- batch = [r for r in batch if r.get('status_code')]
-
for r in batch:
if r['status_code'] != 200 or not r.get('tei_xml'):
self.counts['s3-skip-status'] += 1
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 0b2b19c..6ba06f1 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -56,6 +56,7 @@ CREATE TABLE IF NOT EXISTS grobid (
sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
grobid_version TEXT CHECK (octet_length(grobid_version) >= 1),
+ -- TODO: status_code is validly null if there was a wayback or petabox error, but the column is currently NOT NULL. We want to record these cases so we don't loop re-processing them forever
status_code INT NOT NULL,
status TEXT CHECK (octet_length(status) >= 1),
fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26),