aboutsummaryrefslogtreecommitdiffstats
path: root/python/sandcrawler/persist.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-01-21 11:32:49 -0800
committerBryan Newbold <bnewbold@archive.org>2020-01-21 11:32:51 -0800
commit20291471b34ea559d2ea5d45f3b05884e54d179a (patch)
tree772b58f3fe4091e30e9477e43351c1778f421e40 /python/sandcrawler/persist.py
parent8b9acb1d31b4b8ae84a5133e947ca0a577cd98d8 (diff)
downloadsandcrawler-20291471b34ea559d2ea5d45f3b05884e54d179a.tar.gz
sandcrawler-20291471b34ea559d2ea5d45f3b05884e54d179a.zip
persist grobid: actually, status_code is required
Instead of working around a missing status_code, require the field to be present, and skip rows that lack it in the database insert section. Disk mode still needs to check whether it is blank.
Diffstat (limited to 'python/sandcrawler/persist.py')
-rw-r--r-- python/sandcrawler/persist.py 11
1 files changed, 9 insertions, 2 deletions
diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py
index 801f76d..9ef3e93 100644
--- a/python/sandcrawler/persist.py
+++ b/python/sandcrawler/persist.py
@@ -222,8 +222,14 @@ class PersistGrobidWorker(SandcrawlerWorker):
def push_batch(self, batch):
self.counts['total'] += len(batch)
+ # filter out bad "missing status_code" timeout rows
+ missing = [r for r in batch if not r.get('status_code')]
+ if missing:
+ self.counts['skip-missing-status'] += len(missing)
+ batch = [r for r in batch if r.get('status_code')]
+
for r in batch:
- if r.get('status_code') != 200 or not r.get('tei_xml'):
+ if r['status_code'] != 200 or not r.get('tei_xml'):
self.counts['s3-skip-status'] += 1
if r.get('error_msg'):
r['metadata'] = {'error_msg': r['error_msg'][:500]}
@@ -265,7 +271,8 @@ class PersistGrobidWorker(SandcrawlerWorker):
self.counts['insert-file-meta'] += resp[0]
self.counts['update-file-meta'] += resp[1]
- self.db.commit()
+ self.db.commit()
+
return []