From 84c38d29fc4c7d0dd2e61a5ee9d57ee0a87a61e3 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 4 Jun 2019 17:38:01 -0700
Subject: QA checks (for hash, extid duplication)

---
 extra/checks/.gitignore      |  2 ++
 extra/checks/check_extid.sh  | 56 ++++++++++++++++++++++++++++++++++++++++++++
 extra/checks/check_hashes.sh | 29 +++++++++++++++++++++++
 extra/checks/check_issnl.sh  | 28 ++++++++++++++++++++++
 4 files changed, 115 insertions(+)
 create mode 100644 extra/checks/.gitignore
 create mode 100755 extra/checks/check_extid.sh
 create mode 100755 extra/checks/check_hashes.sh
 create mode 100755 extra/checks/check_issnl.sh

(limited to 'extra')

Note: the three scripts below were revised after this patch was exported
(tab-only field splitting, quoting, usage messages, comments). Hunk line
counts are updated to match; the abbreviated blob ids on their "index"
lines were not regenerated and no longer describe the new contents.

diff --git a/extra/checks/.gitignore b/extra/checks/.gitignore
new file mode 100644
index 00000000..431c3bbc
--- /dev/null
+++ b/extra/checks/.gitignore
@@ -0,0 +1,2 @@
+*.txt
+*.tsv
diff --git a/extra/checks/check_extid.sh b/extra/checks/check_extid.sh
new file mode 100755
index 00000000..f74f50b6
--- /dev/null
+++ b/extra/checks/check_extid.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+#
+# Report external identifiers that are attached to more than one ident.
+#
+# Usage: check_extid.sh EXTID_FILE
+#
+# EXTID_FILE is a gzip-compressed table with the ident in column 1 and the
+# DOI, PMID, PMCID and Wikidata identifiers in columns 3-6. It is assumed to
+# be tab-separated with empty cells for missing identifiers (TODO confirm
+# against the dump that produces it).
+#
+# For each identifier type NAME this writes, in the current directory:
+#   NAME_ident.tsv        sorted "<extid>\t<ident>" rows
+#   NAME_ident.dupes.tsv  the rows whose <extid> appears more than once
+# and appends the line counts of the dupes files to counts.txt.
+
+set -e -u -o pipefail
+
+export LC_ALL=C
+
+EXTID_FILE=${1:?usage: check_extid.sh EXTID_FILE}
+
+# Write sorted "<extid>\t<ident>" rows for one column of EXTID_FILE.
+#   $1 - 1-based column number holding the identifier
+#   $2 - output path
+extract_column() {
+  local col=$1 out=$2
+  # Split on tabs only: awk's default whitespace splitting collapses empty
+  # cells, shifting later columns into the wrong slot. Empty identifiers are
+  # dropped inside awk because a separate `rg -v` stage exits non-zero when
+  # it selects nothing, which would abort the script under pipefail.
+  zcat "$EXTID_FILE" \
+    | awk -F'\t' -v col="$col" '$col != "" {print $col "\t" $1}' \
+    | sort -S 4G \
+    > "$out"
+}
+
+# Write every row of a sorted "<extid>\t<ident>" file whose <extid> repeats.
+#   $1 - sorted input path
+#   $2 - output path
+find_dupes() {
+  local src=$1 out=$2
+  # these identifiers aren't fixed-width, so we need to join (sigh)
+  cut -f1 "$src" \
+    | uniq -d \
+    | join -t$'\t' - "$src" \
+    > "$out"
+}
+
+for spec in doi:3 pmid:4 pmcid:5 wikidata:6; do
+  name=${spec%%:*}
+  extract_column "${spec##*:}" "${name}_ident.tsv"
+  find_dupes "${name}_ident.tsv" "${name}_ident.dupes.tsv"
+done
+
+wc -l doi_ident.dupes.tsv pmid_ident.dupes.tsv pmcid_ident.dupes.tsv wikidata_ident.dupes.tsv >> counts.txt
diff --git a/extra/checks/check_hashes.sh b/extra/checks/check_hashes.sh
new file mode 100755
index 00000000..94102329
--- /dev/null
+++ b/extra/checks/check_hashes.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+#
+# Report SHA-1 hashes that are attached to more than one ident.
+#
+# Usage: check_hashes.sh HASH_FILE
+#
+# HASH_FILE is a gzip-compressed table with the ident in column 1 and the
+# SHA-1 hex digest in column 3; assumed tab-separated (TODO confirm).
+#
+# Writes one "<sha1>\t<ident>" row per duplicated hash to
+# sha1_ident.dupes.tsv and appends its line count to counts.txt.
+
+set -e -u -o pipefail
+
+export LC_ALL=C
+
+HASH_FILE=${1:?usage: check_hashes.sh HASH_FILE}
+
+# Split on tabs only so empty cells cannot shift columns, and drop rows with no
+# hash inside awk: a separate `rg -v` stage exits non-zero when it selects
+# nothing, which would abort the script under pipefail.
+# uniq compares only the first 40 characters, the width of a SHA-1 hex digest.
+zcat "$HASH_FILE" \
+  | awk -F'\t' '$3 != "" {print $3 "\t" $1}' \
+  | sort -S 4G \
+  | uniq -d -w 40 \
+  > sha1_ident.dupes.tsv
+
+wc -l sha1_ident.dupes.tsv >> counts.txt
diff --git a/extra/checks/check_issnl.sh b/extra/checks/check_issnl.sh
new file mode 100755
index 00000000..333f747b
--- /dev/null
+++ b/extra/checks/check_issnl.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+#
+# Report ISSN-L values that are attached to more than one container ident.
+#
+# Usage: check_issnl.sh CONTAINER_DUMP
+#
+# CONTAINER_DUMP is a gzip-compressed stream of JSON objects carrying
+# `issnl` and `ident` keys.
+#
+# Writes one "<issnl>\t<ident>" row per duplicated ISSN-L to
+# issnl_ident.dupes.tsv and appends its line count to counts.txt.
+
+set -e -u -o pipefail
+
+export LC_ALL=C
+
+CONTAINER_DUMP=${1:?usage: check_issnl.sh CONTAINER_DUMP}
+
+# Containers without an ISSN-L are skipped: @tsv renders null as an empty
+# cell, and such rows would otherwise be compared on the leading characters
+# of their ident. uniq compares only the first 9 characters ("NNNN-NNNN").
+zcat "$CONTAINER_DUMP" \
+  | jq -r 'select((.issnl // "") != "") | [.issnl, .ident] | @tsv' \
+  | sort -S 4G \
+  | uniq -d -w 9 \
+  > issnl_ident.dupes.tsv
+
+wc -l issnl_ident.dupes.tsv >> counts.txt
-- 
cgit v1.2.3