diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-04-05 19:24:07 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-04-05 19:24:07 -0700 |
commit | e68d43e2369eed7ddf288be8c8f2edd0a85974e1 (patch) | |
tree | 204c2dd9e2933c848affe76f6c56aaf2b032d4d4 /mapreduce/backfill_hbase_from_cdx.py | |
parent | 5db075beaa55b2d619798154c06c2df625346972 (diff) | |
download | sandcrawler-e68d43e2369eed7ddf288be8c8f2edd0a85974e1.tar.gz sandcrawler-e68d43e2369eed7ddf288be8c8f2edd0a85974e1.zip |
make happybase mock injection slightly less horrible
Diffstat (limited to 'mapreduce/backfill_hbase_from_cdx.py')
-rwxr-xr-x | mapreduce/backfill_hbase_from_cdx.py | 28 |
1 files changed, 12 insertions, 16 deletions
```diff
diff --git a/mapreduce/backfill_hbase_from_cdx.py b/mapreduce/backfill_hbase_from_cdx.py
index 57c18e4..2643195 100755
--- a/mapreduce/backfill_hbase_from_cdx.py
+++ b/mapreduce/backfill_hbase_from_cdx.py
@@ -43,27 +43,23 @@ class MRCDXBackfillHBase(MRJob):
                                     help='HBase thrift API host to connect to')
 
     def __init__(self, *args, **kwargs):
-
-        # Allow passthrough for tests
-        if 'hb_table' in kwargs:
-            self.hb_table = kwargs.pop('hb_table')
-        else:
-            self.hb_table = None
-
         super(MRCDXBackfillHBase, self).__init__(*args, **kwargs)
         self.mime_filter = ['application/pdf']
+        self.hb_table = None
 
     def mapper_init(self):
 
-        if self.hb_table is None:
-            try:
-                host = self.options.hbase_host
-                # TODO: make these configs accessible from... mrconf.cfg?
-                hb_conn = happybase.Connection(host=host, transport="framed",
-                    protocol="compact")
-            except Exception as err:
-                raise Exception("Couldn't connect to HBase using host: {}".format(host))
-            self.hb_table = hb_conn.table(self.options.hbase_table)
+        if self.hb_table:
+            return
+
+        try:
+            host = self.options.hbase_host
+            # TODO: make these configs accessible from... mrconf.cfg?
+            hb_conn = happybase.Connection(host=host, transport="framed",
+                protocol="compact")
+        except Exception as err:
+            raise Exception("Couldn't connect to HBase using host: {}".format(host))
+        self.hb_table = hb_conn.table(self.options.hbase_table)
 
     def mapper(self, _, raw_cdx):
 
```