From a8cd91e6f6fb6dafac35f8c239113b55b2230b13 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 24 Dec 2019 16:39:52 -0800
Subject: have JsonLinePusher continue on JSON decode errors (but count)

---
 python/sandcrawler/workers.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'python/sandcrawler/workers.py')

diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index a8b03c7..a110754 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -219,7 +219,11 @@ class JsonLinePusher(RecordPusher):
             if not line:
                 continue
             self.counts['total'] += 1
-            record = json.loads(line)
+            try:
+                record = json.loads(line)
+            except json.decoder.JSONDecodeError:
+                self.counts['error-json-decode'] += 1
+                continue
             if self.batch_size:
                 batch.append(record)
                 if len(batch) >= self.batch_size:
-- 
cgit v1.2.3