diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-03 02:25:03 +0000 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-03 02:25:03 +0000 |
commit | 630e956c05604aaf8bf5b7154a01ad956b13e440 (patch) | |
tree | e397d981624664e907665c2f485930d02b23fc2a /backfill/backfill_hbase_from_cdx.py | |
parent | d8f3c2ffad0b685db6c3196ac3efe846c019f6d7 (diff) | |
download | sandcrawler-630e956c05604aaf8bf5b7154a01ad956b13e440.tar.gz sandcrawler-630e956c05604aaf8bf5b7154a01ad956b13e440.zip |
fix silly bugs in backfiller (need more tests)
Diffstat (limited to 'backfill/backfill_hbase_from_cdx.py')
-rwxr-xr-x | backfill/backfill_hbase_from_cdx.py | 7 |
1 files changed, 4 insertions, 3 deletions
```diff
diff --git a/backfill/backfill_hbase_from_cdx.py b/backfill/backfill_hbase_from_cdx.py
index 04ae7d9..e6596d5 100755
--- a/backfill/backfill_hbase_from_cdx.py
+++ b/backfill/backfill_hbase_from_cdx.py
@@ -167,17 +167,18 @@ class MRCDXBackfillHBase(MRJob):
                 raw_cdx.startswith('#')):
 
             # Skip line
+            # XXX: tests don't cover this path; need coverage!
             self.increment_counter('lines', 'invalid')
-            return _, status
+            return _, dict(status="invalid")
 
         info = transform_line(raw_cdx)
         if info is None:
             self.increment_counter('lines', 'invalid')
-            return
+            return _, dict(status="invalid")
 
         if info['file:mime'] not in self.mime_filter:
             self.increment_counter('lines', 'skip')
-            return
+            return _, dict(status="skip")
 
         key = info.pop('key')
         info['f:c'] = json.dumps(info['f:c'], sort_keys=True, indent=None)
```