-rw-r--r--  .gitignore                        |   1
-rw-r--r--  README.md                         |   2
-rw-r--r--  mapreduce/README.md               |   7
-rwxr-xr-x  please                            | 152
-rw-r--r--  scalding/README.md                |  22
-rw-r--r--  scalding/ia_cluster.conf          |   0
-rw-r--r--  scalding/scalding-background.md   |   7
7 files changed, 177 insertions, 14 deletions
diff --git a/.gitignore b/.gitignore
index 8a9f43e..28d3c9f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
mapreduce-*.tar.gz
*,cover
htmlcov/
+mapreduce/venv-current.tar.gz
*.o
*.a
diff --git a/README.md b/README.md
index 90bfd75..e53e775 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Pretty much everything here uses python/pipenv. To setup your environment for
this, and python in general:
# libjpeg-dev is for some wayback/pillow stuff
- sudo apt install -y python3-dev python3-pip python3-wheel libjpeg-dev build-essentials
+ sudo apt install -y python3-dev python3-pip python3-wheel libjpeg-dev build-essential
pip3 install --user pipenv
On macOS:
diff --git a/mapreduce/README.md b/mapreduce/README.md
index b63e84b..aebc160 100644
--- a/mapreduce/README.md
+++ b/mapreduce/README.md
@@ -33,6 +33,7 @@ running on a devbox and GROBID running on a dedicated machine:
Running from the cluster:
# Create tarball of virtualenv
+ export PIPENV_VENV_IN_PROJECT=1
pipenv shell
export VENVSHORT=`basename $VIRTUAL_ENV`
tar -czf $VENVSHORT.tar.gz -C /home/bnewbold/.local/share/virtualenvs/$VENVSHORT .
@@ -60,9 +61,9 @@ Actual invocation to run on Hadoop cluster (running on an IA devbox, where
hadoop environment is configured):
# Create tarball of virtualenv
- pipenv shell
- export VENVSHORT=`basename $VIRTUAL_ENV`
- tar -czf $VENVSHORT.tar.gz -C /home/bnewbold/.local/share/virtualenvs/$VENVSHORT .
+ export PIPENV_VENV_IN_PROJECT=1
+ pipenv install --deploy
+ tar -czf venv-current.tar.gz -C .venv .
./backfill_hbase_from_cdx.py \
--hbase-host wbgrp-svc263.us.archive.org \
diff --git a/please b/please
new file mode 100755
index 0000000..2d4cae8
--- /dev/null
+++ b/please
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+"""
+Helper script for running Sandcrawler (journal pipeline) tasks in production.
+
+This is basically a Makefile. Be sure to only use python3 standard library
+modules, so there are no dependencies.
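+
+Example invocations (the CDX paths here are illustrative, not real files):
+
+    ./please --qa backfill hdfs:///user/bnewbold/pdfs/example.cdx
+    ./please --prod --rebuild extract hdfs:///user/bnewbold/pdfs/example.cdx
+    ./please --qa row-count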
+"""
+
+import sys
+import argparse
+import subprocess
+from datetime import datetime
+
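+# Cluster-specific defaults: base HDFS path for job output, plus the HBase
+# host and GROBID service URI that the job commands below talk to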
+HDFS_DIR = "hdfs:///user/bnewbold/sandcrawler"
+HBASE_HOST = "wbgrp-svc263.us.archive.org"
+GROBID_URI = "http://wbgrp-svc096.us.archive.org:8070"
+
+def rebuild_python():
+    print("Rebuilding python venv...")
+    cmd = """cd mapreduce;
+        export PIPENV_VENV_IN_PROJECT=1;
+        pipenv install --deploy
+        tar -czf venv-current.tar.gz -C .venv ."""
+    subprocess.call(cmd, shell=True)
+
+def rebuild_scalding():
+    print("Rebuilding scalding jar...")
+    cmd = """cd scalding; sbt assembly"""
+    subprocess.call(cmd, shell=True)
+
+def run_backfill(args):
+    if args.rebuild:
+        rebuild_python()
+    print("Starting backfill job...")
+    output = "{}/output-{}/{}-backfill".format(
+        HDFS_DIR,
+        args.env,
+        datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
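+    # venv-current.tar.gz (built by rebuild_python) is shipped to the Hadoop
+    # workers via mrjob's --archive flag and unpacked there as ./venv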
+    cmd = """cd mapreduce;
+        pipenv run ./backfill_hbase_from_cdx.py \
+            --hbase-host {hbase_host} \
+            --hbase-table wbgrp-journal-extract-0-{env} \
+            -r hadoop \
+            -c mrjob.conf \
+            --archive venv-current.tar.gz#venv \
+            {input_cdx}
+        """.format(hbase_host=HBASE_HOST, env=args.env,
+            input_cdx=args.input_cdx)
+    subprocess.call(cmd, shell=True)
+
+def run_extract(args):
+    if args.rebuild:
+        rebuild_python()
+    print("Starting extract job...")
+    output = "{}/output-{}/{}-extract".format(
+        HDFS_DIR,
+        args.env,
+        datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+    cmd = """cd mapreduce;
+        pipenv run ./extraction_cdx_grobid.py \
+            --hbase-host {hbase_host} \
+            --hbase-table wbgrp-journal-extract-0-{env} \
+            --grobid-uri {grobid_uri} \
+            -r hadoop \
+            -c mrjob.conf \
+            --archive venv-current.tar.gz#venv \
+            --jobconf mapred.line.input.format.linespermap=8000 \
+            --jobconf mapreduce.job.queuename=extraction \
+            --jobconf mapred.task.timeout=3600000 \
+            {input_cdx}
+        """.format(hbase_host=HBASE_HOST, env=args.env,
+            input_cdx=args.input_cdx,
+            grobid_uri=GROBID_URI)
+    subprocess.call(cmd, shell=True)
+
+def run_rowcount(args):
+    if args.rebuild:
+        rebuild_scalding()
+    print("Starting rowcount job...")
+    output = "{}/output-{}/{}-rowcount".format(
+        HDFS_DIR,
+        args.env,
+        datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
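+    # Runs the assembled Scalding jar through Hadoop's generic Tool runner,
+    # with --app.conf.path pointing at scalding/ia_cluster.conf for cluster settings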
+    cmd = """hadoop jar \
+        scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+        com.twitter.scalding.Tool sandcrawler.HBaseRowCountJob \
+        --hdfs \
+        --app.conf.path scalding/ia_cluster.conf \
+        --output {}""".format(output)
+    subprocess.call(cmd, shell=True)
+
+def run_statuscount(args):
+    if args.rebuild:
+        rebuild_scalding()
+    print("Starting statuscount job...")
+    output = "{}/output-{}/{}-statuscount".format(
+        HDFS_DIR,
+        args.env,
+        datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
+    cmd = """hadoop jar \
+        scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
+        com.twitter.scalding.Tool sandcrawler.HBaseStatusCountJob \
+        --hdfs \
+        --app.conf.path scalding/ia_cluster.conf \
+        --output {}""".format(output)
+    subprocess.call(cmd, shell=True)
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument('--prod',
+        help="run against prod HBase table",
+        action='store_true')
+    parser.add_argument('--qa',
+        help="run against qa HBase table",
+        action='store_true')
+    parser.add_argument('--rebuild',
+        help="rebuild whatever artifact gets sent",
+        action='store_true')
+    subparsers = parser.add_subparsers()
+
+    sub_backfill = subparsers.add_parser('backfill')
+    sub_backfill.set_defaults(func=run_backfill)
+    sub_backfill.add_argument('input_cdx',
+        help="full HDFS path of CDX file to backfill")
+
+    sub_extract = subparsers.add_parser('extract')
+    sub_extract.set_defaults(func=run_extract)
+    sub_extract.add_argument('input_cdx',
+        help="full HDFS path of CDX file to extract")
+
+    sub_rowcount = subparsers.add_parser('row-count')
+    sub_rowcount.set_defaults(func=run_rowcount)
+
+    sub_statuscount = subparsers.add_parser('status-count')
+    sub_statuscount.set_defaults(func=run_statuscount)
+
+    args = parser.parse_args()
+    if not args.__dict__.get("func"):
+        print("tell me what to do! (try --help)")
+        sys.exit(-1)
+    if not (args.prod or args.qa) or (args.prod and args.qa):
+        print("must pass one of --prod or --qa")
+        sys.exit(-1)
+    if args.prod:
+        args.env = "prod"
+    if args.qa:
+        args.env = "qa"
+
+    args.func(args)
+
+if __name__ == '__main__':
+    main()
diff --git a/scalding/README.md b/scalding/README.md
index c40da5c..45b62d0 100644
--- a/scalding/README.md
+++ b/scalding/README.md
@@ -3,12 +3,19 @@ the JVM) using the Scalding framework.
See the other markdown files in this directory for more background and tips.
-## Building and Running
+## Dependencies
Locally, you need to have the JVM (eg, OpenJDK 1.8), `sbt` build tool, and
might need (exactly) Scala version 2.11.8.
-See section below on building and installing custom SpyGlass jar.
+On a debian/ubuntu machine:
+
+ echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list
+ sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
+ sudo apt-get update
+ sudo apt install scala sbt
+
+## Building and Running
Run tests:
@@ -26,17 +33,12 @@ Run on cluster:
com.twitter.scalding.Tool sandcrawler.HBaseRowCountJob --hdfs \
--app.conf.path thing.conf \
--output hdfs:///user/bnewbold/spyglass_out_test
-
+
+## Troubleshooting
+
If your `sbt` task fails with this error:
java.util.concurrent.ExecutionException: java.lang.OutOfMemoryError: Metaspace
try restarting `sbt` with more memory (e.g., `sbt -mem 2048`).
-## SpyGlass Jar
-
-SpyGlass is a "scalding-to-HBase" connector. It isn't maintained, so we needed
-to rebuild to support our versions of HBase/scalding/etc. Our fork (including
-build instructions) is at <https://github.com/bnewbold/SpyGlass>
-(`bnewbold-scala2.11` branch); compiled .jar files are available from
-<https://archive.org/download/ia_sandcrawler_maven2>.
diff --git a/scalding/ia_cluster.conf b/scalding/ia_cluster.conf
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/scalding/ia_cluster.conf
diff --git a/scalding/scalding-background.md b/scalding/scalding-background.md
index 99b363a..4d62c7e 100644
--- a/scalding/scalding-background.md
+++ b/scalding/scalding-background.md
@@ -1,4 +1,11 @@
+## Why Scalding
+
+Scalding vs. Java (MapReduce) vs. Java (Cascading) vs. Scoobi vs. Scrunch:
+
+- <https://speakerdeck.com/agemooij/why-hadoop-mapreduce-needs-scala?slide=34>
+- <https://github.com/twitter/scalding/wiki/Comparison-to-Scrunch-and-Scoobi>
+
## Tips/Gotchas
`.scala` file names should match internal classes.
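
For example (illustrative): a job class `sandcrawler.HBaseRowCountJob` should
live in a file named `HBaseRowCountJob.scala`.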