diff options
author | Bryan Newbold <bnewbold@archive.org> | 2019-12-24 16:39:52 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-02 18:12:58 -0800 |
commit | a8cd91e6f6fb6dafac35f8c239113b55b2230b13 (patch) | |
tree | 6abb0959665d4107176780a455d95f68ed13f9f7 /python | |
parent | 460843e31ebea16fcb543b8448365cfe004103b0 (diff) | |
download | sandcrawler-a8cd91e6f6fb6dafac35f8c239113b55b2230b13.tar.gz sandcrawler-a8cd91e6f6fb6dafac35f8c239113b55b2230b13.zip |
have JsonLinePusher continue on JSON decode errors (but count)
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/workers.py | 6 |
1 file changed, 5 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index a8b03c7..a110754 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -219,7 +219,11 @@ class JsonLinePusher(RecordPusher):
             if not line:
                 continue
             self.counts['total'] += 1
-            record = json.loads(line)
+            try:
+                record = json.loads(line)
+            except json.decoder.JSONDecodeError:
+                self.counts['error-json-decode'] += 1
+                continue
             if self.batch_size:
                 batch.append(record)
                 if len(batch) >= self.batch_size:
```