aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-01-21 11:32:49 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-21 11:32:51 -0800
commit: 20291471b34ea559d2ea5d45f3b05884e54d179a (patch)
tree: 772b58f3fe4091e30e9477e43351c1778f421e40
parent: 8b9acb1d31b4b8ae84a5133e947ca0a577cd98d8 (diff)
download: sandcrawler-20291471b34ea559d2ea5d45f3b05884e54d179a.tar.gz
download: sandcrawler-20291471b34ea559d2ea5d45f3b05884e54d179a.zip
persist grobid: actually, status_code is required
Instead of working around a missing status_code, require it to be present, and skip rows that lack it in the database insert section. Disk mode still needs to check whether status_code is blank.
-rw-r--r-- python/sandcrawler/db.py 2
-rw-r--r-- python/sandcrawler/persist.py 11
2 files changed, 10 insertions, 3 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index e1414ba..3ec325e 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -161,7 +161,7 @@ class SandcrawlerPostgresClient:
r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
batch = [(d['key'],
d.get('grobid_version') or None,
- d.get('status_code') or None,
+ d['status_code'],
d['status'],
d.get('fatcat_release') or None,
d.get('updated') or datetime.datetime.now(),
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 801f76d..9ef3e93 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -222,8 +222,14 @@ class PersistGrobidWorker(SandcrawlerWorker):
def push_batch(self, batch):
self.counts['total'] += len(batch)
+ # filter out bad "missing status_code" timeout rows
+ missing = [r for r in batch if not r.get('status_code')]
+ if missing:
+ self.counts['skip-missing-status'] += len(missing)
+ batch = [r for r in batch if r.get('status_code')]
+
for r in batch:
- if r.get('status_code') != 200 or not r.get('tei_xml'):
+ if r['status_code'] != 200 or not r.get('tei_xml'):
self.counts['s3-skip-status'] += 1
if r.get('error_msg'):
r['metadata'] = {'error_msg': r['error_msg'][:500]}
@@ -265,7 +271,8 @@ class PersistGrobidWorker(SandcrawlerWorker):
self.counts['insert-file-meta'] += resp[0]
self.counts['update-file-meta'] += resp[1]
- self.db.commit()
+ self.db.commit()
+
return []