diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2019-12-24 16:39:52 -0800 |
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-01-02 18:12:58 -0800 |
| commit | a8cd91e6f6fb6dafac35f8c239113b55b2230b13 (patch) | |
| tree | 6abb0959665d4107176780a455d95f68ed13f9f7 | |
| parent | 460843e31ebea16fcb543b8448365cfe004103b0 (diff) | |
| download | sandcrawler-a8cd91e6f6fb6dafac35f8c239113b55b2230b13.tar.gz sandcrawler-a8cd91e6f6fb6dafac35f8c239113b55b2230b13.zip | |
have JsonLinePusher continue on JSON decode errors (but count)
| -rw-r--r-- | python/sandcrawler/workers.py | 6 |
1 file changed, 5 insertions, 1 deletion
```diff
diff --git a/python/sandcrawler/workers.py b/python/sandcrawler/workers.py
index a8b03c7..a110754 100644
--- a/python/sandcrawler/workers.py
+++ b/python/sandcrawler/workers.py
@@ -219,7 +219,11 @@ class JsonLinePusher(RecordPusher):
             if not line:
                 continue
             self.counts['total'] += 1
-            record = json.loads(line)
+            try:
+                record = json.loads(line)
+            except json.decoder.JSONDecodeError:
+                self.counts['error-json-decode'] += 1
+                continue
             if self.batch_size:
                 batch.append(record)
                 if len(batch) >= self.batch_size:
```
