export CITEFILE=enwiki-20211201-pages-articles.citations.json.gz

# total number of articles processed
zcat $CITEFILE | wc -l
=> 6,416,542

# articles with one or more refs
zcat $CITEFILE | rg '"CitationClass"' | wc -l
=> 4,389,394

# total number of refs
zcat $CITEFILE | jq '.refs[].CitationClass' -r | wc -l
=> 37,724,219

# refs by type
zcat $CITEFILE | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr
    22379651 web
     6430930 news
     4124318 book
     2699528 journal
      770319 citation
      388893 harvnb
      106646 gazette
      102752 pressrelease
       86908 nhle
       82366 encyclopedia
       72401 gnis
       71358 report
       70419 nrisref
       49233 episode
       43692 AV-media-notes
       43058 geonet3
       42300 map
       41957 season
       36349 AV-media
       24498 thesis
       18723 conference
       10591 england
        8283 interview
        5084 sports-reference
        3567 podcast
        2528 arxiv
        1915 techreport
        1466 mailinglist
        1325 speech
         869 newsgroup
         864 sign
         647 serial
         566 DVD-notes
         215 policy

# identifiers present
zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r | sort | uniq -c | sort -nr
     2925108 ISBN
     1932505 DOI
     1010556 PMID
      557313 ISSN
      356460 PMC
      320414 BIBCODE
      275283 OCLC
      166861 JSTOR
       85286 ARXIV
       23600 MR
       21797 LCCN
        9927 ASIN
        7814 OL
        4878 ZBL
        4556 SSRN
        1071 OSTI
         672 JFM
         424 USENETID
         152 ISMN
          86 RFC

# refs with URL, by type
zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr
    22291836 web
     5995008 news
     2033769 book
     1217383 journal
      487205 citation
      101594 pressrelease
       65505 report
       63903 encyclopedia
       28179 map
       26538 AV-media
       17167 thesis
       11806 conference
       10924 episode
        7042 interview
        5084 sports-reference
        3562 podcast
        2484 AV-media-notes
        1458 mailinglist
        1416 techreport
        1072 speech
         831 newsgroup
         515 sign
         139 serial
          24 harvnb
          22 DVD-notes
          15 gazette

# refs with ArchiveURL, by type
zcat $CITEFILE | jq '.refs[] | select(.ArchiveURL != null) | .CitationClass' -r | sort | uniq -c | sort -nr
     4738008 web
     1089407 news
      111550 journal
       79472 book
       76758 citation
       33111 pressrelease
       12318 report
        8019 encyclopedia
        6061 AV-media
        5068 map
        4887 sports-reference
        2716 conference
        2138 interview
        1915 episode
        1716 thesis
         578 podcast
         400 AV-media-notes
         308 speech
         257 mailinglist
         244 techreport
         149 newsgroup
          32 sign
          10 serial
           7 DVD-notes
           4 nhle
           3 england
           2 gazette
           1 gnis

# top ref URL hosts (not domains, but leading 'www' removed)
zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .URL' -r | cut -f3 -d/ | sed 's/^www\.//g' | sort | uniq -c | sort -nr | head -n30
     1266572 books.google.com
      552597 archive.org
      372440 nytimes.com
      361046 ncbi.nlm.nih.gov
      275580 bbc.co.uk
      261514 newspapers.com
      245562 theguardian.com
      214278 billboard.com
      197563 youtube.com
      190427 news.bbc.co.uk
      160735 census.gov
      143304 news.google.com
      125024 allmusic.com
      110937 nla.gov.au
      105348 washingtonpost.com
       93509 telegraph.co.uk
       85782 bbc.com
       82966 espncricinfo.com
       82789 timesofindia.indiatimes.com
       81518 variety.com
       77786 imdb.com
       76921 independent.co.uk
       73717 baseball-reference.com
       72244 deadline.com
       67088 animenewsnetwork.com
       64784 sports-reference.com
       63994 reuters.com
       63394 hollywoodreporter.com
       60720 thehindu.com
       58972 tvbythenumbers.zap2it.com
    [...]