diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/db.py | 3 | ||||
-rw-r--r-- | python/sandcrawler/persist.py | 6 |
2 files changed, 2 insertions, 7 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py index 3ec325e..5662b32 100644 --- a/python/sandcrawler/db.py +++ b/python/sandcrawler/db.py @@ -161,7 +161,8 @@ class SandcrawlerPostgresClient: r['metadata'] = json.dumps(r['metadata'], sort_keys=True) batch = [(d['key'], d.get('grobid_version') or None, - d['status_code'], + # status_code is validly not set if there was, eg, error-wayback in grobid-worker + d.get('status_code') or 0, d['status'], d.get('fatcat_release') or None, d.get('updated') or datetime.datetime.now(), diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index 9ef3e93..77a1a82 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -222,12 +222,6 @@ class PersistGrobidWorker(SandcrawlerWorker): def push_batch(self, batch): self.counts['total'] += len(batch) - # filter out bad "missing status_code" timeout rows - missing = [r for r in batch if not r.get('status_code')] - if missing: - self.counts['skip-missing-status'] += len(missing) - batch = [r for r in batch if r.get('status_code')] - for r in batch: if r['status_code'] != 200 or not r.get('tei_xml'): self.counts['s3-skip-status'] += 1 |