1 files changed, 33 insertions, 5 deletions
diff --git a/please b/please
index 023eeac..c888bbc 100755
--- a/please
+++ b/please
@@ -2,8 +2,8 @@
 """
 Helper script for running Sandcrawler (journal pipeline) tasks in production.
 
-This is basically a Makefile. Be sure to only use python3 standard library
-modules, so there are no dependencies.
+This is basically a Makefile. If you edit this file, be sure to only use
+python3 standard library modules, so there are no dependencies.
 """
 
 import sys
@@ -18,7 +18,7 @@ GROBID_URI = "http://wbgrp-svc096.us.archive.org:8070"
 
 def rebuild_python():
     print("Rebuilding python venv...")
-    cmd = """cd mapreduce;
+    cmd = """cd python;
         export PIPENV_VENV_IN_PROJECT=1;
         pipenv install --deploy
         tar -czf venv-current.tar.gz -C .venv ."""
@@ -37,7 +37,7 @@ def run_backfill(args):
         HDFS_DIR,
         args.env,
         datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
-    cmd = """cd mapreduce;
+    cmd = """cd python;
         pipenv run ./backfill_hbase_from_cdx.py \
             --hbase-host {hbase_host} \
             --hbase-table wbgrp-journal-extract-0-{env} \
@@ -57,7 +57,7 @@ def run_extract(args):
         HDFS_DIR,
         args.env,
         datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
-    cmd = """cd mapreduce;
+    cmd = """cd python;
         pipenv run ./extraction_cdx_grobid.py \
             --hbase-host {hbase_host} \
             --hbase-table wbgrp-journal-extract-0-{env} \
@@ -215,6 +215,24 @@ def run_colcount(args):
             env=args.env)
     subprocess.call(cmd, shell=True)
 
+def run_matchbenchmark(args):  # handler for the 'match-benchmark' subcommand
+    if args.rebuild:  # optional rebuild step before launching the job
+        rebuild_scalding()  # defined elsewhere in this file; presumably rebuilds the scalding jar -- confirm
+    print("Starting matchbenchmark job...")
+    cmd = """./pig/deps/hadoop/bin/hadoop jar \
+        scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+        com.twitter.scalding.Tool \
+        sandcrawler.MatchBenchmarkJob \
+        --local \
+        --app.conf.path scalding/ia_cluster.conf \
+        --left-bibjson {left_bibjson} \
+        --right-bibjson {right_bibjson} \
+        --output {output}""".format(  # substitute the three positional CLI arguments
+            output=args.output,
+            left_bibjson=args.left_bibjson,
+            right_bibjson=args.right_bibjson)
+    subprocess.call(cmd, shell=True)  # NOTE(review): return code is discarded and args are interpolated unquoted into a shell string
+
 def run_keysmissingcol(args):
     if args.rebuild:
         rebuild_scalding()
@@ -288,6 +306,15 @@ def main():
     sub_colcount.add_argument('column',
         help="column name to use in count")
 
+    sub_matchbenchmark = subparsers.add_parser('match-benchmark')
+    sub_matchbenchmark.set_defaults(func=run_matchbenchmark)
+    sub_matchbenchmark.add_argument('left_bibjson',
+        help="First bibjson file")
+    sub_matchbenchmark.add_argument('right_bibjson',
+        help="Second bibjson file")
+    sub_matchbenchmark.add_argument('output',
+        help="where to write output")
+
     sub_keysmissingcol = subparsers.add_parser('keys-missing-col')
     sub_keysmissingcol.set_defaults(func=run_keysmissingcol)
     sub_keysmissingcol.add_argument('column',
@@ -299,6 +326,7 @@ def main():
         sys.exit(-1)
     if not (args.prod or args.qa) or (args.prod and args.qa):
         print("must pass one of --prod or --qa")
+        sys.exit(-1)
     if args.prod:
         args.env = "prod"
     if args.qa: