aboutsummaryrefslogtreecommitdiffstats
path: root/backfill/backfill_hbase_from_cdx.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-04-03 02:25:03 +0000
committer Bryan Newbold <bnewbold@archive.org> 2018-04-03 02:25:03 +0000
commit 630e956c05604aaf8bf5b7154a01ad956b13e440 (patch)
tree e397d981624664e907665c2f485930d02b23fc2a /backfill/backfill_hbase_from_cdx.py
parent d8f3c2ffad0b685db6c3196ac3efe846c019f6d7 (diff)
download sandcrawler-630e956c05604aaf8bf5b7154a01ad956b13e440.tar.gz
sandcrawler-630e956c05604aaf8bf5b7154a01ad956b13e440.zip
fix silly bugs in backfiller (need more tests)
Diffstat (limited to 'backfill/backfill_hbase_from_cdx.py')
-rwxr-xr-x backfill/backfill_hbase_from_cdx.py 7
1 file changed, 4 insertions, 3 deletions
diff --git a/backfill/backfill_hbase_from_cdx.py b/backfill/backfill_hbase_from_cdx.py
index 04ae7d9..e6596d5 100755
--- a/backfill/backfill_hbase_from_cdx.py
+++ b/backfill/backfill_hbase_from_cdx.py
@@ -167,17 +167,18 @@ class MRCDXBackfillHBase(MRJob):
raw_cdx.startswith('#')):
# Skip line
+ # XXX: tests don't cover this path; need coverage!
self.increment_counter('lines', 'invalid')
- return _, status
+ return _, dict(status="invalid")
info = transform_line(raw_cdx)
if info is None:
self.increment_counter('lines', 'invalid')
- return
+ return _, dict(status="invalid")
if info['file:mime'] not in self.mime_filter:
self.increment_counter('lines', 'skip')
- return
+ return _, dict(status="skip")
key = info.pop('key')
info['f:c'] = json.dumps(info['f:c'], sort_keys=True, indent=None)