aboutsummaryrefslogtreecommitdiffstats
path: root/extra/wikipedia/stats_enwiki_20211201.txt
blob: 05539796535792122b6a7d45c51f264a56183e87 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168

export CITEFILE=enwiki-20211201-pages-articles.citations.json.gz

# total number of articles processed
zcat $CITEFILE | wc -l
=> 6,416,542

# articles with one or more refs
zcat $CITEFILE | rg '"CitationClass"' | wc -l
=> 4,389,394

# total number of refs
zcat $CITEFILE | jq '.refs[].CitationClass' -r | wc -l
=> 37,724,219

# refs by type (value of each ref's CitationClass field)
zcat $CITEFILE | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr
   22379651 web
    6430930 news
    4124318 book
    2699528 journal
     770319 citation
     388893 harvnb
     106646 gazette
     102752 pressrelease
      86908 nhle
      82366 encyclopedia
      72401 gnis
      71358 report
      70419 nrisref
      49233 episode
      43692 AV-media-notes
      43058 geonet3
      42300 map
      41957 season
      36349 AV-media
      24498 thesis
      18723 conference
      10591 england
       8283 interview
       5084 sports-reference
       3567 podcast
       2528 arxiv
       1915 techreport
       1466 mailinglist
       1325 speech
        869 newsgroup
        864 sign
        647 serial
        566 DVD-notes
        215 policy

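# identifier types present: number of refs carrying each ID_list key (a ref with several identifiers is counted once per key)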
zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r | sort | uniq -c | sort -nr
    2925108 ISBN
    1932505 DOI
    1010556 PMID
     557313 ISSN
     356460 PMC
     320414 BIBCODE
     275283 OCLC
     166861 JSTOR
      85286 ARXIV
      23600 MR
      21797 LCCN
       9927 ASIN
       7814 OL
       4878 ZBL
       4556 SSRN
       1071 OSTI
        672 JFM
        424 USENETID
        152 ISMN
         86 RFC

# refs with URL, by type
zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr
   22291836 web
    5995008 news
    2033769 book
    1217383 journal
     487205 citation
     101594 pressrelease
      65505 report
      63903 encyclopedia
      28179 map
      26538 AV-media
      17167 thesis
      11806 conference
      10924 episode
       7042 interview
       5084 sports-reference
       3562 podcast
       2484 AV-media-notes
       1458 mailinglist
       1416 techreport
       1072 speech
        831 newsgroup
        515 sign
        139 serial
         24 harvnb
         22 DVD-notes
         15 gazette

# refs with ArchiveURL, by type
zcat $CITEFILE | jq '.refs[] | select(.ArchiveURL != null) | .CitationClass' -r | sort | uniq -c | sort -nr
    4738008 web
    1089407 news
     111550 journal
      79472 book
      76758 citation
      33111 pressrelease
      12318 report
       8019 encyclopedia
       6061 AV-media
       5068 map
       4887 sports-reference
       2716 conference
       2138 interview
       1915 episode
       1716 thesis
        578 podcast
        400 AV-media-notes
        308 speech
        257 mailinglist
        244 techreport
        149 newsgroup
         32 sign
         10 serial
          7 DVD-notes
          4 nhle
          3 england
          2 gazette
          1 gnis

# top 30 ref URL hosts (full hostnames, not registered domains; only a leading 'www.' is stripped)
zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .URL' -r | cut -f3 -d/ | sed 's/^www\.//g' | sort | uniq -c | sort -nr | head -n30
    1266572 books.google.com
     552597 archive.org
     372440 nytimes.com
     361046 ncbi.nlm.nih.gov
     275580 bbc.co.uk
     261514 newspapers.com
     245562 theguardian.com
     214278 billboard.com
     197563 youtube.com
     190427 news.bbc.co.uk
     160735 census.gov
     143304 news.google.com
     125024 allmusic.com
     110937 nla.gov.au
     105348 washingtonpost.com
      93509 telegraph.co.uk
      85782 bbc.com
      82966 espncricinfo.com
      82789 timesofindia.indiatimes.com
      81518 variety.com
      77786 imdb.com
      76921 independent.co.uk
      73717 baseball-reference.com
      72244 deadline.com
      67088 animenewsnetwork.com
      64784 sports-reference.com
      63994 reuters.com
      63394 hollywoodreporter.com
      60720 thehindu.com
      58972 tvbythenumbers.zap2it.com
    [...]