about | summary | refs | log | tree | commit | diff | stats
path: root/python
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-01-21 11:32:49 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-21 11:32:51 -0800
commit: 20291471b34ea559d2ea5d45f3b05884e54d179a (patch)
tree: 772b58f3fe4091e30e9477e43351c1778f421e40 /python
parent: 8b9acb1d31b4b8ae84a5133e947ca0a577cd98d8 (diff)
download: sandcrawler-20291471b34ea559d2ea5d45f3b05884e54d179a.tar.gz
download: sandcrawler-20291471b34ea559d2ea5d45f3b05884e54d179a.zip
persist grobid: actually, status_code is required
Instead of working around a missing status_code, require it to exist, but skip rows that lack it in the database insert section. Disk mode still needs to check whether it is blank.
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/db.py 2
-rw-r--r-- python/sandcrawler/persist.py 11
2 files changed, 10 insertions, 3 deletions
diff --git a/python/sandcrawler/db.py b/python/sandcrawler/db.py
index e1414ba..3ec325e 100644
--- a/python/sandcrawler/db.py
+++ b/python/sandcrawler/db.py
@@ -161,7 +161,7 @@ class SandcrawlerPostgresClient:
r['metadata'] = json.dumps(r['metadata'], sort_keys=True)
batch = [(d['key'],
d.get('grobid_version') or None,
- d.get('status_code') or None,
+ d['status_code'],
d['status'],
d.get('fatcat_release') or None,
d.get('updated') or datetime.datetime.now(),
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 801f76d..9ef3e93 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -222,8 +222,14 @@ class PersistGrobidWorker(SandcrawlerWorker):
def push_batch(self, batch):
self.counts['total'] += len(batch)
+ # filter out bad "missing status_code" timeout rows
+ missing = [r for r in batch if not r.get('status_code')]
+ if missing:
+ self.counts['skip-missing-status'] += len(missing)
+ batch = [r for r in batch if r.get('status_code')]
+
for r in batch:
- if r.get('status_code') != 200 or not r.get('tei_xml'):
+ if r['status_code'] != 200 or not r.get('tei_xml'):
self.counts['s3-skip-status'] += 1
if r.get('error_msg'):
r['metadata'] = {'error_msg': r['error_msg'][:500]}
@@ -265,7 +271,8 @@ class PersistGrobidWorker(SandcrawlerWorker):
self.counts['insert-file-meta'] += resp[0]
self.counts['update-file-meta'] += resp[1]
- self.db.commit()
+ self.db.commit()
+
return []