about | summary | refs | log | tree | commit | diff | stats
path: root/please
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2018-07-15 22:49:07 +0000
committer: Bryan Newbold <bnewbold@archive.org> 2018-07-15 22:49:07 +0000
commit: 746870a10215549c25a16529eabaeb199a3b9228 (patch)
tree: 2d91dd8f2b153738d6f1370b4623b6d816bceba5 /please
parent: f955fef045d3c78e351a639546a27ab0a53fc0aa (diff)
download: sandcrawler-746870a10215549c25a16529eabaeb199a3b9228.tar.gz
sandcrawler-746870a10215549c25a16529eabaeb199a3b9228.zip
update please helpers to provide hbase+zk config
Diffstat (limited to 'please')
-rwxr-xr-x please 15
1 files changed, 13 insertions, 2 deletions
diff --git a/please b/please
index 2d4cae8..a244b80 100755
--- a/please
+++ b/please
@@ -13,6 +13,7 @@ from datetime import datetime
HDFS_DIR = "hdfs:///user/bnewbold/sandcrawler"
HBASE_HOST = "wbgrp-svc263.us.archive.org"
+ZOOKEEPER_HOSTS = "mtrcs-zk1.us.archive.org:2181"
GROBID_URI = "http://wbgrp-svc096.us.archive.org:8070"
def rebuild_python():
@@ -86,7 +87,12 @@ def run_rowcount(args):
com.twitter.scalding.Tool sandcrawler.HBaseRowCountJob \
--hdfs \
--app.conf.path scalding/ia_cluster.conf \
- --output {}""".format(output)
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --output {output}""".format(
+ output=output,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env)
subprocess.call(cmd, shell=True)
def run_statuscount(args):
@@ -102,7 +108,12 @@ def run_statuscount(args):
com.twitter.scalding.Tool sandcrawler.HBaseStatusCountJob \
--hdfs \
--app.conf.path scalding/ia_cluster.conf \
- --output {}""".format(output)
+ --hbase-table wbgrp-journal-extract-0-{env} \
+ --zookeeper-hosts {zookeeper_hosts} \
+ --output {output}""".format(
+ output=output,
+ zookeeper_hosts=ZOOKEEPER_HOSTS,
+ env=args.env)
subprocess.call(cmd, shell=True)
def main():