about | summary | refs | log | tree | commit | diff | stats
path: root/mapreduce/backfill_hbase_from_cdx.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2018-04-05 19:24:07 -0700
committer Bryan Newbold <bnewbold@archive.org> 2018-04-05 19:24:07 -0700
commit e68d43e2369eed7ddf288be8c8f2edd0a85974e1 (patch)
tree 204c2dd9e2933c848affe76f6c56aaf2b032d4d4 /mapreduce/backfill_hbase_from_cdx.py
parent 5db075beaa55b2d619798154c06c2df625346972 (diff)
download sandcrawler-e68d43e2369eed7ddf288be8c8f2edd0a85974e1.tar.gz
download sandcrawler-e68d43e2369eed7ddf288be8c8f2edd0a85974e1.zip
make happybase mock injection slightly less horrible
Diffstat (limited to 'mapreduce/backfill_hbase_from_cdx.py')
-rwxr-xr-x mapreduce/backfill_hbase_from_cdx.py 28
1 files changed, 12 insertions, 16 deletions
diff --git a/mapreduce/backfill_hbase_from_cdx.py b/mapreduce/backfill_hbase_from_cdx.py
index 57c18e4..2643195 100755
--- a/mapreduce/backfill_hbase_from_cdx.py
+++ b/mapreduce/backfill_hbase_from_cdx.py
@@ -43,27 +43,23 @@ class MRCDXBackfillHBase(MRJob):
help='HBase thrift API host to connect to')
def __init__(self, *args, **kwargs):
-
- # Allow passthrough for tests
- if 'hb_table' in kwargs:
- self.hb_table = kwargs.pop('hb_table')
- else:
- self.hb_table = None
-
super(MRCDXBackfillHBase, self).__init__(*args, **kwargs)
self.mime_filter = ['application/pdf']
+ self.hb_table = None
def mapper_init(self):
- if self.hb_table is None:
- try:
- host = self.options.hbase_host
- # TODO: make these configs accessible from... mrconf.cfg?
- hb_conn = happybase.Connection(host=host, transport="framed",
- protocol="compact")
- except Exception as err:
- raise Exception("Couldn't connect to HBase using host: {}".format(host))
- self.hb_table = hb_conn.table(self.options.hbase_table)
+ if self.hb_table:
+ return
+
+ try:
+ host = self.options.hbase_host
+ # TODO: make these configs accessible from... mrconf.cfg?
+ hb_conn = happybase.Connection(host=host, transport="framed",
+ protocol="compact")
+ except Exception as err:
+ raise Exception("Couldn't connect to HBase using host: {}".format(host))
+ self.hb_table = hb_conn.table(self.options.hbase_table)
def mapper(self, _, raw_cdx):