diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2019-06-04 17:38:01 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2019-06-04 17:38:01 -0700 |
commit | 84c38d29fc4c7d0dd2e61a5ee9d57ee0a87a61e3 (patch) | |
tree | b2ce9e75ed06d38216aae039ec12c76e8aaea838 /extra/checks/check_hashes.sh | |
parent | 4fdb056debb07beb36d9bfd0dd358cd398a61a6e (diff) | |
download | fatcat-84c38d29fc4c7d0dd2e61a5ee9d57ee0a87a61e3.tar.gz fatcat-84c38d29fc4c7d0dd2e61a5ee9d57ee0a87a61e3.zip |
QA checks (for hash, extid duplication)
Diffstat (limited to 'extra/checks/check_hashes.sh')
-rwxr-xr-x | extra/checks/check_hashes.sh | 16 |
1 files changed, 16 insertions, 0 deletions
#!/usr/bin/env bash
#
# extra/checks/check_hashes.sh -- QA check for duplicated hashes.
#
# Usage: check_hashes.sh HASH_FILE
#
#   HASH_FILE  gzip-compressed, whitespace-delimited text file. Field 3 is
#              used as the duplicate-detection key and field 1 is reported
#              next to it.
#              NOTE(review): judging by the output file name, field 3 is
#              presumably a 40-character SHA-1 hex digest and field 1 an
#              entity ident -- confirm against the dump format.
#
# Outputs (relative to the current working directory):
#   sha1_ident.dupes.tsv  overwritten; one "<field3>\t<field1>" line for each
#                         group of rows sharing the same first 40 bytes
#   counts.txt            appended with the `wc -l` line for the file above
#
# Requires: zcat, awk, and GNU sort/uniq (for `sort -S` and `uniq -w`).

set -e -u -o pipefail

# Byte-wise collation: keeps sort and uniq in agreement with each other and
# independent of the caller's locale.
export LC_ALL=C

# Fail with a usage hint (instead of a bare "unbound variable" error) when the
# argument is missing or empty.
HASH_FILE="${1:?usage: check_hashes.sh HASH_FILE}"

# Rows with fewer than three fields have no key and are skipped. Doing this in
# awk (NF >= 3) yields the same lines as printing everything and then dropping
# lines that start with a tab, but avoids a filter stage that exits non-zero
# when nothing is selected, which would abort the script under pipefail.
#
# uniq -d prints one line per group of repeated lines; -w 40 limits the
# comparison to the first 40 characters, i.e. the key column.
zcat "$HASH_FILE" \
  | awk 'NF >= 3 {print $3 "\t" $1}' \
  | sort -S 4G \
  | uniq -d -w 40 \
  > sha1_ident.dupes.tsv

wc -l sha1_ident.dupes.tsv >> counts.txt