diff options
author | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-29 15:02:27 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-11-29 15:02:27 -0800 |
commit | 7c6afa0a21883dc8037f3d021246db24eef39b41 (patch) | |
tree | 3fa7c1e595248a46e88ea62c2f9f70106186b0fe /extra/checks | |
parent | c32154f2875a7fb9aac727013e1475cdd811e180 (diff) | |
download | fatcat-7c6afa0a21883dc8037f3d021246db24eef39b41.tar.gz fatcat-7c6afa0a21883dc8037f3d021246db24eef39b41.zip |
clean up extra/ folder a bit
Diffstat (limited to 'extra/checks')
-rw-r--r-- | extra/checks/.gitignore | 2 | ||||
-rwxr-xr-x | extra/checks/check_extid.sh | 49 | ||||
-rwxr-xr-x | extra/checks/check_hashes.sh | 16 | ||||
-rwxr-xr-x | extra/checks/check_issnl.sh | 15 |
4 files changed, 0 insertions, 82 deletions
```diff
diff --git a/extra/checks/.gitignore b/extra/checks/.gitignore
deleted file mode 100644
index 431c3bbc..00000000
--- a/extra/checks/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-*.txt
-*.tsv
diff --git a/extra/checks/check_extid.sh b/extra/checks/check_extid.sh
deleted file mode 100755
index f74f50b6..00000000
--- a/extra/checks/check_extid.sh
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/env bash
-
-set -e -u -o pipefail
-
-export LC_ALL=C
-
-EXTID_FILE=$1
-
-zcat $EXTID_FILE \
- | awk '{print $3 "\t" $1}' \
- | rg -v '^\t' \
- | sort -S 4G \
- > doi_ident.tsv
-zcat $EXTID_FILE \
- | awk '{print $4 "\t" $1}' \
- | rg -v '^\t' \
- | sort -S 4G \
- > pmid_ident.tsv
-zcat $EXTID_FILE \
- | awk '{print $5 "\t" $1}' \
- | rg -v '^\t' \
- | sort -S 4G \
- > pmcid_ident.tsv
-zcat $EXTID_FILE \
- | awk '{print $6 "\t" $1}' \
- | rg -v '^\t' \
- | sort -S 4G \
- > wikidata_ident.tsv
-
-# these identifiers aren't fixed-width, so we need to join (sigh)
-cut -f1 doi_ident.tsv \
- | uniq -d \
- | join -t$'\t' - doi_ident.tsv \
- > doi_ident.dupes.tsv
-cut -f1 pmid_ident.tsv \
- | uniq -d \
- | join -t$'\t' - pmid_ident.tsv \
- > pmid_ident.dupes.tsv
-cut -f1 pmcid_ident.tsv \
- | uniq -d \
- | join -t$'\t' - pmcid_ident.tsv \
- > pmcid_ident.dupes.tsv
-cut -f1 wikidata_ident.tsv \
- | uniq -d \
- | join -t$'\t' - wikidata_ident.tsv \
- > wikidata_ident.dupes.tsv
-
-wc -l doi_ident.dupes.tsv pmid_ident.dupes.tsv pmcid_ident.dupes.tsv wikidata_ident.dupes.tsv >> counts.txt
-
diff --git a/extra/checks/check_hashes.sh b/extra/checks/check_hashes.sh
deleted file mode 100755
index 94102329..00000000
--- a/extra/checks/check_hashes.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-#!/usr/bin/env bash
-
-set -e -u -o pipefail
-
-export LC_ALL=C
-
-HASH_FILE=$1
-
-zcat $HASH_FILE \
- | awk '{print $3 "\t" $1}' \
- | rg -v '^\t' \
- | sort -S 4G \
- | uniq -d -w 40 \
- > sha1_ident.dupes.tsv
-
-wc -l sha1_ident.dupes.tsv >> counts.txt
diff --git a/extra/checks/check_issnl.sh b/extra/checks/check_issnl.sh
deleted file mode 100755
index a28695e7..00000000
--- a/extra/checks/check_issnl.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env bash
-
-set -e -u -o pipefail
-
-export LC_ALL=C
-
-CONTAINER_DUMP=$1
-
-zcat $CONTAINER_DUMP \
- | jq '[.issnl, .ident] | @tsv' -r \
- | sort -S 4G \
- | uniq -D -w 9 \
- > issnl_ident.dupes.tsv
-
-wc -l issnl_ident.dupes.tsv >> counts.txt
```