diff options
| -rw-r--r-- | python/sandcrawler/db.py | 3 | ||||
| -rw-r--r-- | python/sandcrawler/persist.py | 6 | ||||
| -rw-r--r-- | sql/migrations/2019-12-19-060141_init/up.sql | 1 | 
3 files changed, 3 insertions, 7 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 3ec325e..5662b32 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -161,7 +161,8 @@ class SandcrawlerPostgresClient:
                 r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
         batch = [(d['key'],
                   d.get('grobid_version') or None,
-                  d['status_code'],
+                  # status_code is validly not set if there was, eg, error-wayback in grobid-worker
+                  d.get('status_code') or 0,
                   d['status'],
                   d.get('fatcat_release') or None,
                   d.get('updated') or datetime.datetime.now(),
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 9ef3e93..77a1a82 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -222,12 +222,6 @@ class PersistGrobidWorker(SandcrawlerWorker):
     def push_batch(self, batch):
         self.counts['total'] += len(batch)
 
-        # filter out bad "missing status_code" timeout rows
-        missing = [r for r in batch if not r.get('status_code')]
-        if missing:
-            self.counts['skip-missing-status'] += len(missing)
-            batch = [r for r in batch if r.get('status_code')]
-
         for r in batch:
             if r['status_code'] != 200 or not r.get('tei_xml'):
                 self.counts['s3-skip-status'] += 1
diff --git a/sql/migrations/2019-12-19-060141_init/up.sql b/sql/migrations/2019-12-19-060141_init/up.sql
index 0b2b19c..6ba06f1 100644
--- a/sql/migrations/2019-12-19-060141_init/up.sql
+++ b/sql/migrations/2019-12-19-060141_init/up.sql
@@ -56,6 +56,7 @@ CREATE TABLE IF NOT EXISTS grobid (
     sha1hex             TEXT PRIMARY KEY CHECK (octet_length(sha1hex) = 40),
     updated             TIMESTAMP WITH TIME ZONE DEFAULT now() NOT NULL,
     grobid_version      TEXT CHECK (octet_length(grobid_version) >= 1),
+    -- TODO: status_code is validly null if there was a wayback or petabox error. We want to record these cases so we don't loop re-processing forever
     status_code         INT NOT NULL,
     status              TEXT CHECK (octet_length(status) >= 1),
     fatcat_release      TEXT CHECK (octet_length(fatcat_release) = 26),
