about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2018-08-15 19:10:13 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2018-08-15 19:10:13 -0700
commit: fafe5b1b2d8f34c6f336b7ae1a48cc78deb90c11 (patch)
tree: da74f62e2481dcaeee5260e742ffba4695656770
parent: 3ff30c8f20d36f8e47ec5478c10c3348d2f45fa6 (diff)
download: sandcrawler-fafe5b1b2d8f34c6f336b7ae1a48cc78deb90c11.tar.gz
download: sandcrawler-fafe5b1b2d8f34c6f336b7ae1a48cc78deb90c11.zip
update 'please' command for scoring refactor
-rwxr-xr-x please 11
1 files changed, 10 insertions, 1 deletions
diff --git a/please b/please
index 3563343..1a992f2 100755
--- a/please
+++ b/please
@@ -124,9 +124,13 @@ def run_matchcrossref(args):
HDFS_DIR,
args.env,
datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+ # Notes: -D options must come after Tool but before class name
+ # https://github.com/twitter/scalding/wiki/Frequently-asked-questions#how-do-i-pass-parameters-to-my-hadoop-job-number-of-reducers--memory-options--etc-
cmd = """hadoop jar \
scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
- com.twitter.scalding.Tool sandcrawler.HBaseCrossrefScoreJob \
+ com.twitter.scalding.Tool \
+ -Dmapred.reduce.tasks={reducers} \
+ sandcrawler.ScoreJob \
--hdfs \
--app.conf.path scalding/ia_cluster.conf \
--hbase-table wbgrp-journal-extract-0-{env} \
@@ -136,6 +140,7 @@ def run_matchcrossref(args):
output=output,
zookeeper_hosts=ZOOKEEPER_HOSTS,
env=args.env,
+ reducers=args.reducers,
crossref_input=args.crossref_input)
subprocess.call(cmd, shell=True)
@@ -173,6 +178,10 @@ def main():
sub_matchcrossref.set_defaults(func=run_matchcrossref)
sub_matchcrossref.add_argument('crossref_input',
help="full HDFS path of Crossref JSON dump")
+ sub_matchcrossref.add_argument('--reducers',
+ help="number of reducers to run",
+ type=int, default=30)
+
args = parser.parse_args()
if not args.__dict__.get("func"):