diff options
| -rw-r--r-- | .gitignore | 1 | ||||
| -rw-r--r-- | README.md | 2 | ||||
| -rw-r--r-- | mapreduce/README.md | 7 | ||||
| -rwxr-xr-x | please | 152 | ||||
| -rw-r--r-- | scalding/README.md | 22 | ||||
| -rw-r--r-- | scalding/ia_cluster.conf | 0 | ||||
| -rw-r--r-- | scalding/scalding-background.md | 7 | 
7 files changed, 177 insertions, 14 deletions
| @@ -2,6 +2,7 @@  mapreduce-*.tar.gz  *,cover  htmlcov/ +mapreduce/venv-current.tar.gz  *.o  *.a @@ -20,7 +20,7 @@ Pretty much everything here uses python/pipenv. To setup your environment for  this, and python in general:      # libjpeg-dev is for some wayback/pillow stuff -    sudo apt install -y python3-dev python3-pip python3-wheel libjpeg-dev build-essentials +    sudo apt install -y python3-dev python3-pip python3-wheel libjpeg-dev build-essential      pip3 install --user pipenv  On macOS: diff --git a/mapreduce/README.md b/mapreduce/README.md index b63e84b..aebc160 100644 --- a/mapreduce/README.md +++ b/mapreduce/README.md @@ -33,6 +33,7 @@ running on a devbox and GROBID running on a dedicated machine:  Running from the cluster:      # Create tarball of virtualenv +    export PIPENV_VENV_IN_PROJECT=1      pipenv shell      export VENVSHORT=`basename $VIRTUAL_ENV`      tar -czf $VENVSHORT.tar.gz -C /home/bnewbold/.local/share/virtualenvs/$VENVSHORT . @@ -60,9 +61,9 @@ Actual invocation to run on Hadoop cluster (running on an IA devbox, where  hadoop environment is configured):      # Create tarball of virtualenv -    pipenv shell -    export VENVSHORT=`basename $VIRTUAL_ENV` -    tar -czf $VENVSHORT.tar.gz -C /home/bnewbold/.local/share/virtualenvs/$VENVSHORT . +    export PIPENV_VENV_IN_PROJECT=1 +    pipenv install --deploy +    tar -czf venv-current.tar.gz -C .venv .      ./backfill_hbase_from_cdx.py \          --hbase-host wbgrp-svc263.us.archive.org \ @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +""" +Helper script for running Sandcrawler (journal pipeline) tasks in production. + +This is basically a Makefile. Be sure to only use python3 standard library +modules, so there are no dependencies. +""" + +import sys +import argparse +import subprocess +from datetime import datetime + +HDFS_DIR = "hdfs:///user/bnewbold/sandcrawler" +HBASE_HOST = "wbgrp-svc263.us.archive.org" +GROBID_URI = "http://wbgrp-svc096.us.archive.org:8070" + +def rebuild_python(): +    print("Rebuilding python venv...") +    cmd = """cd mapreduce; +        export PIPENV_VENV_IN_PROJECT=1; +        pipenv install --deploy +        tar -czf venv-current.tar.gz -C .venv .""" +    subprocess.call(cmd, shell=True) + +def rebuild_scalding(): +    print("Rebuilding scalding jar...") +    cmd = """cd scalding; sbt assembly""" +    subprocess.call(cmd, shell=True) + +def run_backfill(args): +    if args.rebuild: +        rebuild_python() +    print("Starting backfill job...") +    output = "{}/output-{}/{}-backfill".format( +        HDFS_DIR, +        args.env, +        datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S")) +    cmd = """cd mapreduce; +        pipenv run ./backfill_hbase_from_cdx.py \ +            --hbase-host {hbase_host} \ +            --hbase-table wbgrp-journal-extract-0-{env} \ +            -r hadoop \ +            -c mrjob.conf \ +            --archive venv-current.tar.gz#venv \ +            {input_cdx} +            """.format(hbase_host=HBASE_HOST, env=args.env, +                input_cdx=args.input_cdx) +    subprocess.call(cmd, shell=True) + +def run_extract(args): +    if args.rebuild: +        rebuild_python() +    print("Starting extract job...") +    output = "{}/output-{}/{}-extract".format( +        HDFS_DIR, +        args.env, +        datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S")) +    cmd = """cd mapreduce; +        pipenv run ./extraction_cdx_grobid.py \ +            --hbase-host {hbase_host} \ +            --hbase-table wbgrp-journal-extract-0-{env} \ +            --grobid-uri {grobid_uri} \ +            -r hadoop \ +            -c mrjob.conf \ +            --archive venv-current.tar.gz#venv \ +            --jobconf mapred.line.input.format.linespermap=8000 \ +            --jobconf mapreduce.job.queuename=extraction \ +            --jobconf mapred.task.timeout=3600000 \ +            {input_cdx} +            """.format(hbase_host=HBASE_HOST, env=args.env, +                input_cdx=args.input_cdx, +                grobid_uri=GROBID_URI) +    subprocess.call(cmd, shell=True) + +def run_rowcount(args): +    if args.rebuild: +        rebuild_scalding() +    print("Starting rowcount job...") +    output = "{}/output-{}/{}-rowcount".format( +        HDFS_DIR, +        args.env, +        datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S")) +    cmd = """hadoop jar \ +        scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \ +        com.twitter.scalding.Tool sandcrawler.HBaseRowCountJob \ +        --hdfs \ +        --app.conf.path scalding/ia_cluster.conf \ +        --output {}""".format(output) +    subprocess.call(cmd, shell=True) + +def run_statuscount(args): +    if args.rebuild: +        rebuild_scalding() +    print("Starting statuscount job...") +    output = "{}/output-{}/{}-statuscount".format( +        HDFS_DIR, +        args.env, +        datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S")) +    cmd = """hadoop jar \ +        scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \ +        com.twitter.scalding.Tool sandcrawler.HBaseStatusCountJob \ +        --hdfs \ +        --app.conf.path scalding/ia_cluster.conf \ +        --output {}""".format(output) +    subprocess.call(cmd, shell=True) + +def main(): +    parser = argparse.ArgumentParser() + +    parser.add_argument('--prod', +        help="run against prod HBase table", +        action='store_true') +    parser.add_argument('--qa', +        help="run against qa HBase table", +        action='store_true') +    parser.add_argument('--rebuild', +        help="rebuild whatever artifact gets sent", +        action='store_true') +    subparsers = parser.add_subparsers() + +    sub_backfill = subparsers.add_parser('backfill') +    sub_backfill.set_defaults(func=run_backfill) +    sub_backfill.add_argument('input_cdx', +        help="full HDFS path of CDX file to backfill") + +    sub_extract = subparsers.add_parser('extract') +    sub_extract.set_defaults(func=run_extract) +    sub_extract.add_argument('input_cdx', +        help="full HDFS path of CDX file to extract") + +    sub_rowcount = subparsers.add_parser('row-count') +    sub_rowcount.set_defaults(func=run_rowcount) + +    sub_statuscount = subparsers.add_parser('status-count') +    sub_statuscount.set_defaults(func=run_statuscount) + +    args = parser.parse_args() +    if not args.__dict__.get("func"): +        print("tell me what to do! (try --help)") +        sys.exit(-1) +    if not (args.prod or args.qa) or (args.prod and args.qa): +        print("must pass one of --prod or --qa") +    if args.prod: +        args.env = "prod" +    if args.qa: +        args.env = "qa" + +    args.func(args) + +if __name__ == '__main__': +    main() diff --git a/scalding/README.md b/scalding/README.md index c40da5c..45b62d0 100644 --- a/scalding/README.md +++ b/scalding/README.md @@ -3,12 +3,19 @@ the JVM) using the Scalding framework.  See the other markdown files in this directory for more background and tips. -## Building and Running +## Dependencies  Locally, you need to have the JVM (eg, OpenJDK 1.8), `sbt` build tool, and  might need (exactly) Scala version 2.11.8. -See section below on building and installing custom SpyGlass jar. +On a debian/ubuntu machine: + +    echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list +    sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823 +    sudo apt-get update +    sudo apt install scala sbt + +## Building and Running  Run tests: @@ -26,17 +33,12 @@ Run on cluster:          com.twitter.scalding.Tool sandcrawler.HBaseRowCountJob --hdfs \          --app.conf.path thing.conf \          --output hdfs:///user/bnewbold/spyglass_out_test  -         + +## Troubleshooting +  If your `sbt` task fails with this error:       java.util.concurrent.ExecutionException: java.lang.OutOfMemoryError: Metaspace  try restarting `sbt` with more memory (e.g., `sbt -mem 2048`). -## SpyGlass Jar - -SpyGlass is a "scalding-to-HBase" connector. It isn't maintained, so we needed -to rebuild to support our versions of HBase/scalding/etc. Our fork (including -build instructions) is at <https://github.com/bnewbold/SpyGlass> -(`bnewbold-scala2.11` branch); compiled .jar files are available from -<https://archive.org/download/ia_sandcrawler_maven2>. diff --git a/scalding/ia_cluster.conf b/scalding/ia_cluster.conf new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/scalding/ia_cluster.conf diff --git a/scalding/scalding-background.md b/scalding/scalding-background.md index 99b363a..4d62c7e 100644 --- a/scalding/scalding-background.md +++ b/scalding/scalding-background.md @@ -1,4 +1,11 @@ +## Why Scalding + +Scalding vs. Java (MapReduce) vs. Java (Cascading) vs. Scoobi vs. Scrunch: + +- <https://speakerdeck.com/agemooij/why-hadoop-mapreduce-needs-scala?slide=34> +- <https://github.com/twitter/scalding/wiki/Comparison-to-Scrunch-and-Scoobi> +  ## Tips/Gotchas  `.scala` file names should match internal classes. | 
