aboutsummaryrefslogtreecommitdiffstats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r--python/sandcrawler/db.py3
-rw-r--r--python/sandcrawler/persist.py6
2 files changed, 2 insertions, 7 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index 3ec325e..5662b32 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -161,7 +161,8 @@ class SandcrawlerPostgresClient:
r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
batch = [(d['key'],
d.get('grobid_version') or None,
- d['status_code'],
+ # status_code is validly not set if there was, eg, error-wayback in grobid-worker
+ d.get('status_code') or 0,
d['status'],
d.get('fatcat_release') or None,
d.get('updated') or datetime.datetime.now(),
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 9ef3e93..77a1a82 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -222,12 +222,6 @@ class PersistGrobidWorker(SandcrawlerWorker):
def push_batch(self, batch):
self.counts['total'] += len(batch)
- # filter out bad "missing status_code" timeout rows
- missing = [r for r in batch if not r.get('status_code')]
- if missing:
- self.counts['skip-missing-status'] += len(missing)
- batch = [r for r in batch if r.get('status_code')]
-
for r in batch:
if r['status_code'] != 200 or not r.get('tei_xml'):
self.counts['s3-skip-status'] += 1