diff options
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | README.md | 2 | ||||
-rw-r--r-- | mapreduce/README.md | 7 | ||||
-rwxr-xr-x | please | 47 | ||||
-rw-r--r-- | scalding/README.md | 22 | ||||
-rw-r--r-- | scalding/ia_cluster.conf | 0 |
6 files changed, 41 insertions, 38 deletions
@@ -2,6 +2,7 @@ mapreduce-*.tar.gz *,cover htmlcov/ +mapreduce/venv-current.tar.gz *.o *.a @@ -20,7 +20,7 @@ Pretty much everything here uses python/pipenv. To setup your environment for this, and python in general: # libjpeg-dev is for some wayback/pillow stuff - sudo apt install -y python3-dev python3-pip python3-wheel libjpeg-dev build-essentials + sudo apt install -y python3-dev python3-pip python3-wheel libjpeg-dev build-essential pip3 install --user pipenv On macOS: diff --git a/mapreduce/README.md b/mapreduce/README.md index b63e84b..aebc160 100644 --- a/mapreduce/README.md +++ b/mapreduce/README.md @@ -33,6 +33,7 @@ running on a devbox and GROBID running on a dedicated machine: Running from the cluster: # Create tarball of virtualenv + export PIPENV_VENV_IN_PROJECT=1 pipenv shell export VENVSHORT=`basename $VIRTUAL_ENV` tar -czf $VENVSHORT.tar.gz -C /home/bnewbold/.local/share/virtualenvs/$VENVSHORT . @@ -60,9 +61,9 @@ Actual invocation to run on Hadoop cluster (running on an IA devbox, where hadoop environment is configured): # Create tarball of virtualenv - pipenv shell - export VENVSHORT=`basename $VIRTUAL_ENV` - tar -czf $VENVSHORT.tar.gz -C /home/bnewbold/.local/share/virtualenvs/$VENVSHORT . + export PIPENV_VENV_IN_PROJECT=1 + pipenv install --deploy + tar -czf venv-current.tar.gz -C .venv . ./backfill_hbase_from_cdx.py \ --hbase-host wbgrp-svc263.us.archive.org \ @@ -11,44 +11,42 @@ import argparse import subprocess from datetime import datetime -HDFS_OUT_DIR = "/user/bnewbold/sandcrawler/out" +HDFS_DIR = "hdfs:///user/bnewbold/sandcrawler" HBASE_HOST = "wbgrp-svc263.us.archive.org" def run_backfill(args): - output = "hdfs://{}/{}/{}-backfill".format( - HDFS_OUT_DIR, + print("Starting backfill job...") + output = "{}/output-{}/{}-backfill".format( + HDFS_DIR, args.env, - datetime.strftime(datetime.now(), "%Y-%m-%d-%H:%M:%S")) - cmd = """hadoop jar \ - scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \ - com.twitter.scalding.Tool sandcrawler.HBaseRowCountJob --hdfs \ - --app.conf.path scalding/ia_cluster.conf \ - --output hdfs://{}""".format(output) - - + datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S")) cmd = """cd mapreduce; - pipenv shell - export VENVSHORT=`basename $VIRTUAL_ENV` - ./backfill_hbase_from_cdx.py \ - --hbase-host {HBASE_HOST} \ - --hbase-table wbgrp-journal-extract-0-{args.env} \ + export PIPENV_VENV_IN_PROJECT=1; + pipenv install --deploy + tar -czf venv-current.tar.gz -C .venv . + pipenv run ./backfill_hbase_from_cdx.py \ + --hbase-host {hbase_host} \ + --hbase-table wbgrp-journal-extract-0-{env} \ -r hadoop \ -c mrjob.conf \ - --archive $VENVSHORT.tar.gz#venv \ - {args.input_cdx} - """.format() + --archive venv-current.tar.gz#venv \ + {input_cdx} + """.format(hbase_host=HBASE_HOST, env=args.env, + input_cdx=args.input_cdx) subprocess.call(cmd, shell=True) def run_rowcount(args): - output = "hdfs://{}/{}/{}-rowcount".format( - HDFS_OUT_DIR, + print("Starting rowcount job...") + output = "{}/output-{}/{}-rowcount".format( + HDFS_DIR, args.env, - datetime.strftime(datetime.now(), "%Y-%m-%d-%H:%M:%S")) + datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S")) cmd = """hadoop jar \ scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \ - com.twitter.scalding.Tool sandcrawler.HBaseRowCountJob --hdfs \ + com.twitter.scalding.Tool sandcrawler.HBaseRowCountJob \ + --hdfs \ --app.conf.path scalding/ia_cluster.conf \ - --output hdfs://{}""".format(output) + --output {}""".format(output) subprocess.call(cmd, shell=True) def main(): @@ -80,6 +78,7 @@ def main(): args.env = "prod" if args.qa: args.env = "qa" + args.func(args) if __name__ == '__main__': diff --git a/scalding/README.md b/scalding/README.md index c40da5c..45b62d0 100644 --- a/scalding/README.md +++ b/scalding/README.md @@ -3,12 +3,19 @@ the JVM) using the Scalding framework. See the other markdown files in this directory for more background and tips. -## Building and Running +## Dependencies Locally, you need to have the JVM (eg, OpenJDK 1.8), `sbt` build tool, and might need (exactly) Scala version 2.11.8. -See section below on building and installing custom SpyGlass jar. +On a debian/ubuntu machine: + + echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list + sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 + sudo apt-get update + sudo apt install scala sbt + +## Building and Running Run tests: @@ -26,17 +33,12 @@ Run on cluster: com.twitter.scalding.Tool sandcrawler.HBaseRowCountJob --hdfs \ --app.conf.path thing.conf \ --output hdfs:///user/bnewbold/spyglass_out_test - + +## Troubleshooting + If your `sbt` task fails with this error: java.util.concurrent.ExecutionException: java.lang.OutOfMemoryError: Metaspace try restarting `sbt` with more memory (e.g., `sbt -mem 2048`). -## SpyGlass Jar - -SpyGlass is a "scalding-to-HBase" connector. It isn't maintained, so we needed -to rebuild to support our versions of HBase/scalding/etc. Our fork (including -build instructions) is at <https://github.com/bnewbold/SpyGlass> -(`bnewbold-scala2.11` branch); compiled .jar files are available from -<https://archive.org/download/ia_sandcrawler_maven2>. diff --git a/scalding/ia_cluster.conf b/scalding/ia_cluster.conf new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/scalding/ia_cluster.conf |