diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-04-14 00:24:46 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-04-19 20:29:17 +0200 |
commit | 9a66d7c4896f3415816d2df97bba9a01ac0ebf0c (patch) | |
tree | 2371a84935e92044ab024e7b849f00265c4ab2b9 | |
parent | c7b6745dd75fdf4b0e636f39f2bc256da5231195 (diff) | |
download | refcat-9a66d7c4896f3415816d2df97bba9a01ac0ebf0c.tar.gz refcat-9a66d7c4896f3415816d2df97bba9a01ac0ebf0c.zip |
update notes
-rwxr-xr-x | fetch_hadoop.sh | 37 | ||||
-rw-r--r-- | python/notes/version_3.md | 22 |
2 files changed, 59 insertions, 0 deletions
diff --git a/fetch_hadoop.sh b/fetch_hadoop.sh
new file mode 100755
index 0000000..4c16dd3
--- /dev/null
+++ b/fetch_hadoop.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+# This script was originally only for pig scripts; now it can also be used to
+# run scalding code locally (via please)
+
+set -euo pipefail
+
+#PIG_VERSION="0.12.0-cdh5.2.0"
+# Using more recent version to work around snappy classpath problem
+PIG_VERSION="0.17.0"
+HADOOP_VERSION="2.6.0-cdh5.14.4"
+
+mkdir -p pig/deps/
+cd pig/deps/
+
+# Fetch Hadoop Command
+echo https://archive.cloudera.com/cdh5/cdh/5/hadoop-${HADOOP_VERSION}.tar.gz
+wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${HADOOP_VERSION}.tar.gz
+#wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${HADOOP_VERSION}.tar.gz
+#wget -c https://archive.org/serve/hadoop_pig_mirror/hadoop-${HADOOP_VERSION}.tar.gz
+echo "Extracting Hadoop (takes a minute)..."
+tar xvf hadoop-${HADOOP_VERSION}.tar.gz > /dev/null
+ln -fs hadoop-${HADOOP_VERSION} hadoop
+
+# Fetch Pig
+wget -c https://archive.cloudera.com/cdh5/cdh/5/pig-${PIG_VERSION}.tar.gz
+#wget -c http://mirror.metrocast.net/apache/pig/pig-${PIG_VERSION}/pig-${PIG_VERSION}.tar.gz
+#wget -c https://archive.org/serve/hadoop_pig_mirror/pig-${PIG_VERSION}.tar.gz
+echo "Extracting Pig (takes a minute)..."
+tar xvf pig-${PIG_VERSION}.tar.gz > /dev/null
+ln -fs pig-${PIG_VERSION} pig
+
+# No 'readlink -f' on macOS
+# https://stackoverflow.com/a/24572274/4682349
+JAVA_HOME=$(perl -MCwd -e 'print Cwd::abs_path shift' /usr/bin/java | sed "s:bin/java::")
+./pig/bin/pig -x local -version
+./hadoop/bin/hadoop version
diff --git a/python/notes/version_3.md b/python/notes/version_3.md
index 71d4dd1..891b61c 100644
--- a/python/notes/version_3.md
+++ b/python/notes/version_3.md
@@ -208,3 +208,25 @@ $ time zstdcat -T0 /magna/refcat/UnmatchedRefs/date-2021-02-20.json.zst | LC_ALL
 
 A first run only got 64008 docs; improbable that we are missing so many doi.
 Also, need to generalize some skate code a bit.
+
+----
+
+# Verification stats
+
+* have 40257623 clusters, `zstdcat -T0 /magna/refcat/RefsFatcatClusters/date-2021-02-20.json.zst | wc -l`
+* have X cluster of size less than 10
+
+```
+$ zstdcat -T0 /magna/refcat/RefsFatcatClusters/date-2021-02-20.json.zst |
+    jq -rc 'select(.v|length < 10)' | LC_ALL=C wc -l
+```
+
+A 5M sample.
+
+```
+$ awk '{print $3}' cluster_verify_5m.txt | sort | uniq -c | sort -nr
+6886124 StatusDifferent
+4619805 StatusStrong
+3587478 StatusExact
+ 120215 StatusAmbiguous
+```