aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-04-14 00:24:46 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-04-19 20:29:17 +0200
commit9a66d7c4896f3415816d2df97bba9a01ac0ebf0c (patch)
tree2371a84935e92044ab024e7b849f00265c4ab2b9
parentc7b6745dd75fdf4b0e636f39f2bc256da5231195 (diff)
downloadrefcat-9a66d7c4896f3415816d2df97bba9a01ac0ebf0c.tar.gz
refcat-9a66d7c4896f3415816d2df97bba9a01ac0ebf0c.zip
update notes
-rwxr-xr-xfetch_hadoop.sh37
-rw-r--r--python/notes/version_3.md22
2 files changed, 59 insertions, 0 deletions
diff --git a/fetch_hadoop.sh b/fetch_hadoop.sh
new file mode 100755
index 0000000..4c16dd3
--- /dev/null
+++ b/fetch_hadoop.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+# Downloads and unpacks Hadoop and Pig into pig/deps/. Originally only for pig
+# scripts; now it can also be used to run scalding code locally (via please).
+
+set -euo pipefail
+
+#PIG_VERSION="0.12.0-cdh5.2.0"
+# Using more recent version to work around snappy classpath problem
+readonly PIG_VERSION="0.17.0"
+readonly HADOOP_VERSION="2.6.0-cdh5.14.4"
+
+mkdir -p pig/deps/
+cd pig/deps/
+
+# Fetch Hadoop; the tarball name must match the one extracted below.
+echo "https://archive.cloudera.com/cdh5/cdh/5/hadoop-${HADOOP_VERSION}.tar.gz"
+wget -c "https://archive.cloudera.com/cdh5/cdh/5/hadoop-${HADOOP_VERSION}.tar.gz"
+#wget -c https://archive.org/serve/hadoop_pig_mirror/hadoop-${HADOOP_VERSION}.tar.gz
+echo "Extracting Hadoop (takes a minute)..."
+tar xf "hadoop-${HADOOP_VERSION}.tar.gz"
+ln -sfn "hadoop-${HADOOP_VERSION}" hadoop  # -n: on re-run replace the link, do not follow it
+
+# Fetch Pig
+wget -c "https://archive.cloudera.com/cdh5/cdh/5/pig-${PIG_VERSION}.tar.gz"
+#wget -c http://mirror.metrocast.net/apache/pig/pig-${PIG_VERSION}/pig-${PIG_VERSION}.tar.gz
+#wget -c https://archive.org/serve/hadoop_pig_mirror/pig-${PIG_VERSION}.tar.gz
+echo "Extracting Pig (takes a minute)..."
+tar xf "pig-${PIG_VERSION}.tar.gz"
+ln -sfn "pig-${PIG_VERSION}" pig
+
+# Derive JAVA_HOME from the real path of /usr/bin/java. No 'readlink -f' on
+# macOS, hence perl: https://stackoverflow.com/a/24572274/4682349
+JAVA_HOME=$(perl -MCwd -e 'print Cwd::abs_path shift' /usr/bin/java | sed "s:bin/java::")
+export JAVA_HOME  # exported separately so the child processes below can see it
+./pig/bin/pig -x local -version
+./hadoop/bin/hadoop version
diff --git a/python/notes/version_3.md b/python/notes/version_3.md
index 71d4dd1..891b61c 100644
--- a/python/notes/version_3.md
+++ b/python/notes/version_3.md
@@ -208,3 +208,25 @@ $ time zstdcat -T0 /magna/refcat/UnmatchedRefs/date-2021-02-20.json.zst | LC_ALL
A first run only got 64008 docs; improbable that we are missing so many doi.
Also, need to generalize some skate code a bit.
+
+----
+
+# Verification stats
+
+* have 40257623 clusters, `zstdcat -T0 /magna/refcat/RefsFatcatClusters/date-2021-02-20.json.zst | wc -l`
+* have X clusters of size less than 10 (X: TODO, fill in from the command below)
+
+```
+$ zstdcat -T0 /magna/refcat/RefsFatcatClusters/date-2021-02-20.json.zst |
+ jq -rc 'select(.v|length < 10)' | LC_ALL=C wc -l
+```
+
+Status counts from a 5M sample (`cluster_verify_5m.txt`):
+
+```
+$ awk '{print $3}' cluster_verify_5m.txt | sort | uniq -c | sort -nr
+6886124 StatusDifferent
+4619805 StatusStrong
+3587478 StatusExact
+ 120215 StatusAmbiguous
+```