about summary refs log tree commit diff stats
path: root/please
diff options
context:
space:
mode:
Diffstat (limited to 'please')
-rwxr-xr-x please 38
1 files changed, 33 insertions, 5 deletions
diff --git a/please b/please
index 023eeac..c888bbc 100755
--- a/please
+++ b/please
@@ -2,8 +2,8 @@
"""
Helper script for running Sandcrawler (journal pipeline) tasks in production.
-This is basically a Makefile. Be sure to only use python3 standard library
-modules, so there are no dependencies.
+This is basically a Makefile. If you edit this file, be sure to only use
+python3 standard library modules, so there are no dependencies.
"""
import sys
@@ -18,7 +18,7 @@ GROBID_URI = "http://wbgrp-svc096.us.archive.org:8070"
def rebuild_python():
print("Rebuilding python venv...")
- cmd = """cd mapreduce;
+ cmd = """cd python;
export PIPENV_VENV_IN_PROJECT=1;
pipenv install --deploy
tar -czf venv-current.tar.gz -C .venv ."""
@@ -37,7 +37,7 @@ def run_backfill(args):
HDFS_DIR,
args.env,
datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
- cmd = """cd mapreduce;
+ cmd = """cd python;
pipenv run ./backfill_hbase_from_cdx.py \
--hbase-host {hbase_host} \
--hbase-table wbgrp-journal-extract-0-{env} \
@@ -57,7 +57,7 @@ def run_extract(args):
HDFS_DIR,
args.env,
datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
- cmd = """cd mapreduce;
+ cmd = """cd python;
pipenv run ./extraction_cdx_grobid.py \
--hbase-host {hbase_host} \
--hbase-table wbgrp-journal-extract-0-{env} \
@@ -215,6 +215,24 @@ def run_colcount(args):
env=args.env)
subprocess.call(cmd, shell=True)
+def run_matchbenchmark(args):
+ if args.rebuild:
+ rebuild_scalding()
+ print("Starting matchbenchmark job...")
+ cmd = """./pig/deps/hadoop/bin/hadoop jar \
+ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+ com.twitter.scalding.Tool \
+ sandcrawler.MatchBenchmarkJob \
+ --local \
+ --app.conf.path scalding/ia_cluster.conf \
+ --left-bibjson {left_bibjson} \
+ --right-bibjson {right_bibjson} \
+ --output {output}""".format(
+ output=args.output,
+ left_bibjson=args.left_bibjson,
+ right_bibjson=args.right_bibjson)
+ subprocess.call(cmd, shell=True)
+
def run_keysmissingcol(args):
if args.rebuild:
rebuild_scalding()
@@ -288,6 +306,15 @@ def main():
sub_colcount.add_argument('column',
help="column name to use in count")
+ sub_matchbenchmark = subparsers.add_parser('match-benchmark')
+ sub_matchbenchmark.set_defaults(func=run_matchbenchmark)
+ sub_matchbenchmark.add_argument('left_bibjson',
+ help="First bibjson file")
+ sub_matchbenchmark.add_argument('right_bibjson',
+ help="Second bibjson file")
+ sub_matchbenchmark.add_argument('output',
+ help="where to write output")
+
sub_keysmissingcol = subparsers.add_parser('keys-missing-col')
sub_keysmissingcol.set_defaults(func=run_keysmissingcol)
sub_keysmissingcol.add_argument('column',
@@ -299,6 +326,7 @@ def main():
sys.exit(-1)
if not (args.prod or args.qa) or (args.prod and args.qa):
print("must pass one of --prod or --qa")
+ sys.exit(-1)
if args.prod:
args.env = "prod"
if args.qa: