about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 1
-rw-r--r-- README.md 2
-rw-r--r-- mapreduce/README.md 7
-rwxr-xr-x please 47
-rw-r--r-- scalding/README.md 22
-rw-r--r-- scalding/ia_cluster.conf 0
6 files changed, 41 insertions, 38 deletions
diff --git a/.gitignore b/.gitignore
index 8a9f43e..28d3c9f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
mapreduce-*.tar.gz
*,cover
htmlcov/
+mapreduce/venv-current.tar.gz
*.o
*.a
diff --git a/README.md b/README.md
index 90bfd75..e53e775 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Pretty much everything here uses python/pipenv. To setup your environment for
this, and python in general:
# libjpeg-dev is for some wayback/pillow stuff
- sudo apt install -y python3-dev python3-pip python3-wheel libjpeg-dev build-essentials
+ sudo apt install -y python3-dev python3-pip python3-wheel libjpeg-dev build-essential
pip3 install --user pipenv
On macOS:
diff --git a/mapreduce/README.md b/mapreduce/README.md
index b63e84b..aebc160 100644
--- a/mapreduce/README.md
+++ b/mapreduce/README.md
@@ -33,6 +33,7 @@ running on a devbox and GROBID running on a dedicated machine:
Running from the cluster:
# Create tarball of virtualenv
+ export PIPENV_VENV_IN_PROJECT=1
pipenv shell
export VENVSHORT=`basename $VIRTUAL_ENV`
tar -czf $VENVSHORT.tar.gz -C /home/bnewbold/.local/share/virtualenvs/$VENVSHORT .
@@ -60,9 +61,9 @@ Actual invocation to run on Hadoop cluster (running on an IA devbox, where
hadoop environment is configured):
# Create tarball of virtualenv
- pipenv shell
- export VENVSHORT=`basename $VIRTUAL_ENV`
- tar -czf $VENVSHORT.tar.gz -C /home/bnewbold/.local/share/virtualenvs/$VENVSHORT .
+ export PIPENV_VENV_IN_PROJECT=1
+ pipenv install --deploy
+ tar -czf venv-current.tar.gz -C .venv .
./backfill_hbase_from_cdx.py \
--hbase-host wbgrp-svc263.us.archive.org \
diff --git a/please b/please
index c5541b5..688a159 100755
--- a/please
+++ b/please
@@ -11,44 +11,42 @@ import argparse
import subprocess
from datetime import datetime
-HDFS_OUT_DIR = "/user/bnewbold/sandcrawler/out"
+HDFS_DIR = "hdfs:///user/bnewbold/sandcrawler"
HBASE_HOST = "wbgrp-svc263.us.archive.org"
def run_backfill(args):
- output = "hdfs://{}/{}/{}-backfill".format(
- HDFS_OUT_DIR,
+ print("Starting backfill job...")
+ output = "{}/output-{}/{}-backfill".format(
+ HDFS_DIR,
args.env,
- datetime.strftime(datetime.now(), "%Y-%m-%d-%H:%M:%S"))
- cmd = """hadoop jar \
- scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
- com.twitter.scalding.Tool sandcrawler.HBaseRowCountJob --hdfs \
- --app.conf.path scalding/ia_cluster.conf \
- --output hdfs://{}""".format(output)
-
-
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
cmd = """cd mapreduce;
- pipenv shell
- export VENVSHORT=`basename $VIRTUAL_ENV`
- ./backfill_hbase_from_cdx.py \
- --hbase-host {HBASE_HOST} \
- --hbase-table wbgrp-journal-extract-0-{args.env} \
+ export PIPENV_VENV_IN_PROJECT=1;
+ pipenv install --deploy
+ tar -czf venv-current.tar.gz -C .venv .
+ pipenv run ./backfill_hbase_from_cdx.py \
+ --hbase-host {hbase_host} \
+ --hbase-table wbgrp-journal-extract-0-{env} \
-r hadoop \
-c mrjob.conf \
- --archive $VENVSHORT.tar.gz#venv \
- {args.input_cdx}
- """.format()
+ --archive venv-current.tar.gz#venv \
+ {input_cdx}
+ """.format(hbase_host=HBASE_HOST, env=args.env,
+ input_cdx=args.input_cdx)
subprocess.call(cmd, shell=True)
def run_rowcount(args):
- output = "hdfs://{}/{}/{}-rowcount".format(
- HDFS_OUT_DIR,
+ print("Starting rowcount job...")
+ output = "{}/output-{}/{}-rowcount".format(
+ HDFS_DIR,
args.env,
- datetime.strftime(datetime.now(), "%Y-%m-%d-%H:%M:%S"))
+ datetime.strftime(datetime.now(), "%Y-%m-%d-%H%M.%S"))
cmd = """hadoop jar \
scalding/target/scala-2.11/sandcrawler-assembly-0.2.0-SNAPSHOT.jar \
- com.twitter.scalding.Tool sandcrawler.HBaseRowCountJob --hdfs \
+ com.twitter.scalding.Tool sandcrawler.HBaseRowCountJob \
+ --hdfs \
--app.conf.path scalding/ia_cluster.conf \
- --output hdfs://{}""".format(output)
+ --output {}""".format(output)
subprocess.call(cmd, shell=True)
def main():
@@ -80,6 +78,7 @@ def main():
args.env = "prod"
if args.qa:
args.env = "qa"
+
args.func(args)
if __name__ == '__main__':
diff --git a/scalding/README.md b/scalding/README.md
index c40da5c..45b62d0 100644
--- a/scalding/README.md
+++ b/scalding/README.md
@@ -3,12 +3,19 @@ the JVM) using the Scalding framework.
See the other markdown files in this directory for more background and tips.
-## Building and Running
+## Dependencies
Locally, you need to have the JVM (eg, OpenJDK 1.8), `sbt` build tool, and
might need (exactly) Scala version 2.11.8.
-See section below on building and installing custom SpyGlass jar.
+On a debian/ubuntu machine:
+
+ echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list
+ sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv 2EE0EA64E40A89B84B2DF73499E82A75642AC823
+ sudo apt-get update
+ sudo apt install scala sbt
+
+## Building and Running
Run tests:
@@ -26,17 +33,12 @@ Run on cluster:
com.twitter.scalding.Tool sandcrawler.HBaseRowCountJob --hdfs \
--app.conf.path thing.conf \
--output hdfs:///user/bnewbold/spyglass_out_test
-
+
+## Troubleshooting
+
If your `sbt` task fails with this error:
java.util.concurrent.ExecutionException: java.lang.OutOfMemoryError: Metaspace
try restarting `sbt` with more memory (e.g., `sbt -mem 2048`).
-## SpyGlass Jar
-
-SpyGlass is a "scalding-to-HBase" connector. It isn't maintained, so we needed
-to rebuild to support our versions of HBase/scalding/etc. Our fork (including
-build instructions) is at <https://github.com/bnewbold/SpyGlass>
-(`bnewbold-scala2.11` branch); compiled .jar files are available from
-<https://archive.org/download/ia_sandcrawler_maven2>.
diff --git a/scalding/ia_cluster.conf b/scalding/ia_cluster.conf
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/scalding/ia_cluster.conf