# Summary statistics over the citation dump named by CITEFILE.
#
# Each section below is: a heading, the pipeline that computes the figure,
# and (as comments) the output that was recorded alongside that pipeline.
# The recorded figures belong to the dump file named below; re-running the
# pipelines against a different file will produce different numbers.
#
# Tools used by the pipelines: zcat, rg (ripgrep), jq, sort, uniq, wc.
# NOTE(review): the jq filters read a `.refs[]` array per input record with
# `CitationClass`, `ID_list` and `URL` fields — schema is inferred from the
# filters themselves; confirm against the producer of the dump.

export CITEFILE=enwiki-20210801-pages-articles.citations.json.gz

# total number of articles processed
zcat "$CITEFILE" | wc -l
# => 6,348,910

# articles with one or more refs
zcat "$CITEFILE" | rg '"CitationClass"' | wc -l
# => 4,255,940

# total number of refs
zcat "$CITEFILE" | jq '.refs[].CitationClass' -r | wc -l
# => 36,057,931

# refs by type
zcat "$CITEFILE" | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr
#   21257548 web
#    6162933 news
#    3984831 book
#    2620278 journal
#     756082 citation
#     379975 harvnb
#     105212 gazette
#      99427 pressrelease
#      84036 nhle
#      78761 encyclopedia
#      71774 gnis
#      70308 nrisref
#      67731 report
#      48090 episode
#      43060 geonet3
#      41776 map
#      40904 AV-media-notes
#      40140 season
#      28051 AV-media
#      22182 thesis
#      17891 conference
#      10420 england
#       7798 interview
#       5100 sports-reference
#       3332 podcast
#       2557 arxiv
#       1859 techreport
#       1455 mailinglist
#       1284 speech
#        860 newsgroup
#        837 sign
#        657 serial
#        567 DVD-notes
#        215 policy

# identifiers present
zcat "$CITEFILE" \
  | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r \
  | sort | uniq -c | sort -nr
#    2825707 ISBN
#    1859682 DOI
#     980996 PMID
#     520281 ISSN
#     342543 PMC
#     294828 BIBCODE
#     261269 OCLC
#     157287 JSTOR
#      82960 ARXIV
#      23376 MR
#      20723 LCCN
#       9518 ASIN
#       7520 OL
#       4845 ZBL
#       4447 SSRN
#        954 OSTI
#        666 JFM
#        425 USENETID
#        151 ISMN
#         95 RFC

# refs with URL, by type
zcat "$CITEFILE" \
  | jq '.refs[] | select(.URL != null) | .CitationClass' -r \
  | sort | uniq -c | sort -nr
#   21169067 web
#    5743802 news
#    1958773 book
#    1165475 journal
#     476587 citation
#      98280 pressrelease
#      62235 report
#      60868 encyclopedia
#      27680 map
#      18742 AV-media
#      15317 thesis
#      11236 conference
#      10666 episode
#       6609 interview
#       5100 sports-reference
#       3328 podcast
#       2431 AV-media-notes
#       1446 mailinglist
#       1361 techreport
#       1039 speech
#        821 newsgroup
#        504 sign
#        138 serial
#         24 harvnb
#         21 DVD-notes
#         15 gazette
#          2 gnis