diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-05 17:18:52 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-05 17:18:52 -0700 |
commit | 77577da13afe07b5177452122f4cee77e3357b4e (patch) | |
tree | d37e411121abb805c272f48a6f4579e7af650bb1 /mapreduce/backfill_hbase_from_cdx.py | |
parent | 37a775851b2d21f2afe7418a8628c50ae37edc5b (diff) | |
download | sandcrawler-77577da13afe07b5177452122f4cee77e3357b4e.tar.gz sandcrawler-77577da13afe07b5177452122f4cee77e3357b4e.zip |
improve test coverage
Diffstat (limited to 'mapreduce/backfill_hbase_from_cdx.py')
-rwxr-xr-x | mapreduce/backfill_hbase_from_cdx.py | 11 |
1 file changed, 7 insertions, 4 deletions
```diff
diff --git a/mapreduce/backfill_hbase_from_cdx.py b/mapreduce/backfill_hbase_from_cdx.py
index 8a28ec1..57c18e4 100755
--- a/mapreduce/backfill_hbase_from_cdx.py
+++ b/mapreduce/backfill_hbase_from_cdx.py
@@ -75,16 +75,19 @@ class MRCDXBackfillHBase(MRJob):
             # Skip line
             # XXX: tests don't cover this path; need coverage!
             self.increment_counter('lines', 'invalid')
-            return _, dict(status="invalid")
+            yield _, dict(status="invalid", reason="line prefix")
+            return
 
         info = parse_cdx_line(raw_cdx)
         if info is None:
             self.increment_counter('lines', 'invalid')
-            return _, dict(status="invalid")
+            yield _, dict(status="invalid")
+            return
 
         if info['file:mime'] not in self.mime_filter:
             self.increment_counter('lines', 'skip')
-            return _, dict(status="skip")
+            yield _, dict(status="skip", reason="unwanted mimetype")
+            return
 
         key = info.pop('key')
         info['f:c'] = json.dumps(info['f:c'], sort_keys=True, indent=None)
@@ -96,6 +99,6 @@ class MRCDXBackfillHBase(MRJob):
 
         yield _, dict(status="success")
 
-if __name__ == '__main__':
+if __name__ == '__main__': # pragma: no cover
     MRCDXBackfillHBase.run()
```