aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-08-24 12:05:39 -0700
committerBryan Newbold <bnewbold@archive.org>2018-08-24 12:05:41 -0700
commit92584ec4201ecc27af423cbff7b4bc1573edf175 (patch)
tree416d1fae8bd82af1f470a1be25c8763da042a5e4
parente81774a66980ba17c42380884f39aa61b54e5eef (diff)
downloadsandcrawler-92584ec4201ecc27af423cbff7b4bc1573edf175.tar.gz
sandcrawler-92584ec4201ecc27af423cbff7b4bc1573edf175.zip
rework fetch_hadoop script
Should work on macOS now, and fetches hadoop in addition to pig. Still requires wget (not installed by default on macOS).
-rw-r--r--.gitlab-ci.yml2
-rw-r--r--TODO3
-rwxr-xr-xfetch_hadoop.sh38
-rw-r--r--pig/README.md9
-rwxr-xr-xpig/fetch_deps.sh20
5 files changed, 45 insertions, 27 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 3970bbb..2ccf776 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -25,7 +25,7 @@ test_scalding:
# Needs fixing
#test_pig:
# script:
+# - ./fetch_hadoop.sh
# - cd pig
-# - ./fetch_deps.sh
# - pipenv install --dev --deploy
# - JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::") pipenv run pytest
diff --git a/TODO b/TODO
index 5e9220b..821bd0e 100644
--- a/TODO
+++ b/TODO
@@ -8,8 +8,7 @@ pig:
- play with test image on older releases (eg, trusty)
- how to get argument (like --hbase-table) into mrjob.conf, or similar?
-- fix pig gitlab-ci tests (JAVA_HOME). also make fetch_deps *way* more quiet
-- sentry: https://github.com/getsentry/raven-python
+- fix pig gitlab-ci tests (JAVA_HOME)
potential helpers:
- https://github.com/martinblech/xmltodict
diff --git a/fetch_hadoop.sh b/fetch_hadoop.sh
new file mode 100755
index 0000000..633f8fa
--- /dev/null
+++ b/fetch_hadoop.sh
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+# Fetches Hadoop and Pig into pig/deps/; run from the top-level directory.
+# Originally only for pig scripts; also used to run scalding code locally (via please)
+
+set -euo pipefail
+
+#PIG_VERSION="0.12.0-cdh5.2.0"
+# Using more recent version to work around snappy classpath problem
+PIG_VERSION="0.17.0"
+HADOOP_VERSION="2.3.0-cdh5.0.1"
+
+mkdir -p pig/deps/
+cd pig/deps/
+
+# Fetch Hadoop (prints the Cloudera URL, downloads from the archive.org mirror)
+echo "https://archive.cloudera.com/cdh5/cdh/5/hadoop-${HADOOP_VERSION}.tar.gz"
+#wget -c https://archive.cloudera.com/cdh5/cdh/5/hadoop-${HADOOP_VERSION}.tar.gz
+wget -c "https://archive.org/serve/hadoop_pig_mirror/hadoop-${HADOOP_VERSION}.tar.gz"
+echo "Extracting Hadoop (takes a minute)..."
+tar xf "hadoop-${HADOOP_VERSION}.tar.gz"  # no -v: bsdtar lists files on stderr, so '> /dev/null' would not quiet it
+ln -sfn "hadoop-${HADOOP_VERSION}" hadoop  # -n: on re-runs replace the link instead of linking inside its target
+
+# Fetch Pig
+#wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${PIG_VERSION}.tar.gz
+#wget -c http://mirror.metrocast.net/apache/pig/pig-${PIG_VERSION}/pig-${PIG_VERSION}.tar.gz
+wget -c "https://archive.org/serve/hadoop_pig_mirror/pig-${PIG_VERSION}.tar.gz"
+echo "Extracting Pig (takes a minute)..."
+tar xf "pig-${PIG_VERSION}.tar.gz"  # no -v, same reason as for the Hadoop tarball
+ln -sfn "pig-${PIG_VERSION}" pig  # -n: on re-runs replace the link instead of linking inside its target
+
+# No 'readlink -f' on macOS
+# https://stackoverflow.com/a/24572274/4682349
+JAVA_HOME=$(perl -MCwd -e 'print Cwd::abs_path shift' /usr/bin/java | sed "s:bin/java::")
+export JAVA_HOME  # exported separately so a failed lookup still aborts and the launchers below inherit it
+./pig/bin/pig -x local -version
+./hadoop/bin/hadoop version
+
diff --git a/pig/README.md b/pig/README.md
index d14d2ae..df8ce68 100644
--- a/pig/README.md
+++ b/pig/README.md
@@ -12,12 +12,13 @@ by `fetch_deps.sh`) due to [dependency/jar issues][pig-bug] in local mode of
To run tests, you need Java installed and `JAVA_HOME` configured.
-Fetch dependencies (pig):
+Fetch dependencies (including pig) from top-level directory:
- ./fetch_deps.sh
+ ./fetch_hadoop.sh
-Write .pig scripts here, and add a pytho wrapper test to `./tests/` when done.
-Test vector files (input/output) can go in `./tests/files/`.
+Write `.pig` scripts in this directory, and add a python wrapper test to
+`./tests/` when done. Test vector files (input/output) can go in
+`./tests/files/`.
Run the tests with:
diff --git a/pig/fetch_deps.sh b/pig/fetch_deps.sh
deleted file mode 100755
index 4cefa5e..0000000
--- a/pig/fetch_deps.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/env bash
-
-set -euo pipefail
-
-#PIG_VERSION="0.12.0-cdh5.2.0"
-# Using more recent version to work around snappy classpath problem
-PIG_VERSION="0.17.0"
-JAVA_HOME=$(readlink -f /usr/bin/java | sed "s:bin/java::")
-
-mkdir -p deps/
-cd deps/
-
-# Fetch Pig
-#wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${PIG_VERSION}.tar.gz
-#wget -c http://mirror.metrocast.net/apache/pig/pig-${PIG_VERSION}/pig-${PIG_VERSION}.tar.gz
-wget -c https://archive.org/serve/hadoop_pig_mirror/pig-${PIG_VERSION}.tar.gz
-tar xvf pig-${PIG_VERSION}.tar.gz
-ln -fs pig-${PIG_VERSION} pig
-./pig/bin/pig -x local -version
-