blob: f74f50b638119443c0bf325158f2a6ae9a56f30c (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
|
#!/usr/bin/env bash
#
# Report external identifiers that occur on more than one row of a
# gzip-compressed extid dump.
#
# Usage: <this script> EXTID_FILE
#
# For each of the names doi, pmid, pmcid and wikidata (taken from input
# columns 3, 4, 5 and 6 respectively) this writes, in the current directory:
#   <name>_ident.tsv        "<identifier>\t<column 1>" rows, sorted
#   <name>_ident.dupes.tsv  the rows of the file above whose identifier
#                           appears more than once
# and finally appends the line counts of the four dupes files to counts.txt.
#
# Requires: zcat, awk, sort (with -S support), cut, uniq, join, wc.

set -e -u -o pipefail

# Byte-wise collation so that sort, uniq and join all agree on ordering.
export LC_ALL=C

EXTID_FILE=${1:?usage: ${0##*/} EXTID_FILE}

#######################################
# Extract one identifier column, paired with column 1, into a sorted file.
# Globals:   EXTID_FILE (read)
# Arguments: $1 - number of the input column holding the identifier
#            $2 - short name used for the output file
# Outputs:   writes <name>_ident.tsv
#######################################
extract_column() {
  local -r column=$1 name=$2

  # Rows with an empty identifier are dropped inside awk rather than by a
  # separate grep-style filter: such a filter exits non-zero when it selects
  # nothing, which under `set -e -o pipefail` would abort the script whenever
  # a column is entirely empty.
  #
  # NOTE(review): awk splits on runs of whitespace here (default FS), so an
  # empty column in a tab-separated dump would shift later columns left.
  # Confirm the input format; if it is TSV with empty fields, add -F'\t'.
  zcat -- "$EXTID_FILE" \
    | awk -v col="$column" '$col != "" { print $col "\t" $1 }' \
    | sort -S 4G \
    > "${name}_ident.tsv"
}

#######################################
# Keep only the rows whose identifier occurs more than once.
# Arguments: $1 - short name; reads <name>_ident.tsv (must be sorted)
# Outputs:   writes <name>_ident.dupes.tsv
#######################################
find_dupes() {
  local -r name=$1

  # The identifiers aren't fixed-width, so duplicates are detected on the
  # identifier column alone (cut + uniq -d) and then joined back against the
  # full file to recover the complete rows.
  cut -f1 "${name}_ident.tsv" \
    | uniq -d \
    | join -t$'\t' - "${name}_ident.tsv" \
    > "${name}_ident.dupes.tsv"
}

extract_column 3 doi
extract_column 4 pmid
extract_column 5 pmcid
extract_column 6 wikidata

for name in doi pmid pmcid wikidata; do
  find_dupes "$name"
done

wc -l doi_ident.dupes.tsv pmid_ident.dupes.tsv pmcid_ident.dupes.tsv wikidata_ident.dupes.tsv >> counts.txt
|