aboutsummaryrefslogtreecommitdiffstats
path: root/mapreduce/backfill_hbase_from_cdx.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-04-05 17:18:52 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-04-05 17:18:52 -0700
commit 77577da13afe07b5177452122f4cee77e3357b4e (patch)
tree d37e411121abb805c272f48a6f4579e7af650bb1 /mapreduce/backfill_hbase_from_cdx.py
parent 37a775851b2d21f2afe7418a8628c50ae37edc5b (diff)
download sandcrawler-77577da13afe07b5177452122f4cee77e3357b4e.tar.gz
sandcrawler-77577da13afe07b5177452122f4cee77e3357b4e.zip
improve test coverage
Diffstat (limited to 'mapreduce/backfill_hbase_from_cdx.py')
-rwxr-xr-x mapreduce/backfill_hbase_from_cdx.py 11
1 files changed, 7 insertions, 4 deletions
diff --git a/mapreduce/backfill_hbase_from_cdx.py b/mapreduce/backfill_hbase_from_cdx.py
index 8a28ec1..57c18e4 100755
--- a/mapreduce/backfill_hbase_from_cdx.py
+++ b/mapreduce/backfill_hbase_from_cdx.py
@@ -75,16 +75,19 @@ class MRCDXBackfillHBase(MRJob):
# Skip line
# XXX: tests don't cover this path; need coverage!
self.increment_counter('lines', 'invalid')
- return _, dict(status="invalid")
+ yield _, dict(status="invalid", reason="line prefix")
+ return
info = parse_cdx_line(raw_cdx)
if info is None:
self.increment_counter('lines', 'invalid')
- return _, dict(status="invalid")
+ yield _, dict(status="invalid")
+ return
if info['file:mime'] not in self.mime_filter:
self.increment_counter('lines', 'skip')
- return _, dict(status="skip")
+ yield _, dict(status="skip", reason="unwanted mimetype")
+ return
key = info.pop('key')
info['f:c'] = json.dumps(info['f:c'], sort_keys=True, indent=None)
@@ -96,6 +99,6 @@ class MRCDXBackfillHBase(MRJob):
yield _, dict(status="success")
-if __name__ == '__main__':
+if __name__ == '__main__': # pragma: no cover
MRCDXBackfillHBase.run()