blob: 05539796535792122b6a7d45c51f264a56183e87 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
|
export CITEFILE=enwiki-20211201-pages-articles.citations.json.gz
# total number of articles processed
zcat $CITEFILE | wc -l
=> 6,416,542
# articles with one or more refs (lines of the dump containing at least one "CitationClass" key)
zcat $CITEFILE | rg '"CitationClass"' | wc -l
=> 4,389,394
# total number of refs
zcat $CITEFILE | jq '.refs[].CitationClass' -r | wc -l
=> 37,724,219
# refs by type (the CitationClass field), most frequent first
zcat $CITEFILE | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr
22379651 web
6430930 news
4124318 book
2699528 journal
770319 citation
388893 harvnb
106646 gazette
102752 pressrelease
86908 nhle
82366 encyclopedia
72401 gnis
71358 report
70419 nrisref
49233 episode
43692 AV-media-notes
43058 geonet3
42300 map
41957 season
36349 AV-media
24498 thesis
18723 conference
10591 england
8283 interview
5084 sports-reference
3567 podcast
2528 arxiv
1915 techreport
1466 mailinglist
1325 speech
869 newsgroup
864 sign
647 serial
566 DVD-notes
215 policy
# identifier types present (number of refs carrying each key in ID_list), most frequent first
zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r | sort | uniq -c | sort -nr
2925108 ISBN
1932505 DOI
1010556 PMID
557313 ISSN
356460 PMC
320414 BIBCODE
275283 OCLC
166861 JSTOR
85286 ARXIV
23600 MR
21797 LCCN
9927 ASIN
7814 OL
4878 ZBL
4556 SSRN
1071 OSTI
672 JFM
424 USENETID
152 ISMN
86 RFC
# refs with a non-null URL, counted by type (CitationClass)
zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr
22291836 web
5995008 news
2033769 book
1217383 journal
487205 citation
101594 pressrelease
65505 report
63903 encyclopedia
28179 map
26538 AV-media
17167 thesis
11806 conference
10924 episode
7042 interview
5084 sports-reference
3562 podcast
2484 AV-media-notes
1458 mailinglist
1416 techreport
1072 speech
831 newsgroup
515 sign
139 serial
24 harvnb
22 DVD-notes
15 gazette
# refs with a non-null ArchiveURL, counted by type (CitationClass)
zcat $CITEFILE | jq '.refs[] | select(.ArchiveURL != null) | .CitationClass' -r | sort | uniq -c | sort -nr
4738008 web
1089407 news
111550 journal
79472 book
76758 citation
33111 pressrelease
12318 report
8019 encyclopedia
6061 AV-media
5068 map
4887 sports-reference
2716 conference
2138 interview
1915 episode
1716 thesis
578 podcast
400 AV-media-notes
308 speech
257 mailinglist
244 techreport
149 newsgroup
32 sign
10 serial
7 DVD-notes
4 nhle
3 england
2 gazette
1 gnis
# top 30 ref URL hosts (full hostnames, not registered domains; a leading 'www.' is stripped)
zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .URL' -r | cut -f3 -d/ | sed 's/^www\.//g' | sort | uniq -c | sort -nr | head -n30
1266572 books.google.com
552597 archive.org
372440 nytimes.com
361046 ncbi.nlm.nih.gov
275580 bbc.co.uk
261514 newspapers.com
245562 theguardian.com
214278 billboard.com
197563 youtube.com
190427 news.bbc.co.uk
160735 census.gov
143304 news.google.com
125024 allmusic.com
110937 nla.gov.au
105348 washingtonpost.com
93509 telegraph.co.uk
85782 bbc.com
82966 espncricinfo.com
82789 timesofindia.indiatimes.com
81518 variety.com
77786 imdb.com
76921 independent.co.uk
73717 baseball-reference.com
72244 deadline.com
67088 animenewsnetwork.com
64784 sports-reference.com
63994 reuters.com
63394 hollywoodreporter.com
60720 thehindu.com
58972 tvbythenumbers.zap2it.com
[...]
|