aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2019-12-24 16:39:52 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-01-02 18:12:58 -0800
commit: a8cd91e6f6fb6dafac35f8c239113b55b2230b13 (patch)
tree: 6abb0959665d4107176780a455d95f68ed13f9f7
parent: 460843e31ebea16fcb543b8448365cfe004103b0 (diff)
download: sandcrawler-a8cd91e6f6fb6dafac35f8c239113b55b2230b13.tar.gz
sandcrawler-a8cd91e6f6fb6dafac35f8c239113b55b2230b13.zip
have JsonLinePusher continue on JSON decode errors (but count)
-rw-r--r-- python/sandcrawler/workers.py 6
1 files changed, 5 insertions, 1 deletions
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index a8b03c7..a110754 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -219,7 +219,11 @@ class JsonLinePusher(RecordPusher):
if not line:
continue
self.counts['total'] += 1
- record = json.loads(line)
+ try:
+ record = json.loads(line)
+ except json.decoder.JSONDecodeError:
+ self.counts['error-json-decode'] += 1
+ continue
if self.batch_size:
batch.append(record)
if len(batch) >= self.batch_size: