diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-01-28 19:06:25 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-28 19:10:40 -0800 |
commit | 480dc3fa20102ba0a15013954e76d3b1f826026c (patch) | |
tree | 8000344605d23ffbec4a57e518c052f5b77e304b | |
parent | 9f53880c746b9fd84261e3ab7dbbee81501df394 (diff) | |
download | sandcrawler-bnewbold-persist-grobid-errors.tar.gz sandcrawler-bnewbold-persist-grobid-errors.zip |
grobid persist: if status_code is not set, default to 0 (branch: bnewbold-persist-grobid-errors)
We have to set something currently because of a NOT NULL constraint on
the table.
Originally I thought we would just not record rows if there was an
error, and that is still sort of a valid stance. However, when doing
bulk GROBID-ing from cdx table, there exist some "bad" CDX rows which
cause wayback or petabox errors. We should fix bugs or delete these rows
as a cleanup, but until that happens we should record the error state so
we don't loop forever.
One danger of this commit is that we can clobber existing good rows with
new errors rapidly if there is wayback downtime or something like that.
-rw-r--r-- | python/sandcrawler/db.py | 3 | ||||
-rw-r--r-- | python/sandcrawler/persist.py | 6 | ||||
-rw-r--r-- | sql/migrations/2019-12-19-060141_init/up.sql | 1 |
3 files changed, 3 insertions, 7 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index 3ec325e..5662b32 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -161,7 +161,8 @@ class SandcrawlerPostgresClient: r['metadata'] = json.dumps(r['metadata'], sort_keys=True) batch = [(d['key'], d.get('grobid_version') or None, - d['status_code'], + # status_code is validly not set if there was, eg, error-wayback in grobid-worker + d.get('status_code') or 0, d['status'], d.get('fatcat_release') or None, d.get('updated') or datetime.datetime.now(), diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 9ef3e93..77a1a82 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -222,12 +222,6 @@ class PersistGrobidWorker(SandcrawlerWorker): def push_batch(self, batch): self.counts['total'] += len(batch) - # filter out bad "missing status_code" timeout rows - missing = [r for r in batch if not r.get('status_code')] - if missing: - self.counts['skip-missing-status'] += len(missing) - batch = [r for r in batch if r.get('status_code')] - for r in batch: if r['status_code'] != 200 or not r.get('tei_xml'): self.counts['s3-skip-status'] += 1 diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql index 0b2b19c..6ba06f1 100644 --- a/sql/migrations/2019-12-19-060141_init/up.sql +++ b/sql/migrations/2019-12-19-060141_init/up.sql @@ -56,6 +56,7 @@ CREATE TABLE IF NOT EXISTS grobid ( sha1hex TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40), updated TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL, grobid_version TEXT CHECK (octet_length(grobid_version) >= 1), + -- TODO: status_code is validly null if there was a wayback or petabox error. We want to record these cases so we don't loop re-processing forever status_code INT NOT NULL, status TEXT CHECK (octet_length(status) >= 1), fatcat_release TEXT CHECK (octet_length(fatcat_release) = 26), |