about summary refs log tree commit diff stats
path: root/extra/wikipedia/stats_enwiki_20210801.txt
blob: 9acfc830bf0e0da81f4701f312a61996c7125d55 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104

export CITEFILE=enwiki-20210801-pages-articles.citations.json.gz

# total number of articles processed
zcat $CITEFILE | wc -l
=> 6,348,910

# articles with one or more refs (lines containing at least one "CitationClass" key)
zcat $CITEFILE | rg '"CitationClass"' | wc -l
=> 4,255,940

# total number of refs
zcat $CITEFILE | jq '.refs[].CitationClass' -r | wc -l
=> 36,057,931

# refs by type (CitationClass), most frequent first
zcat $CITEFILE | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr
 21257548 web
  6162933 news
  3984831 book
  2620278 journal
   756082 citation
   379975 harvnb
   105212 gazette
    99427 pressrelease
    84036 nhle
    78761 encyclopedia
    71774 gnis
    70308 nrisref
    67731 report
    48090 episode
    43060 geonet3
    41776 map
    40904 AV-media-notes
    40140 season
    28051 AV-media
    22182 thesis
    17891 conference
    10420 england
     7798 interview
     5100 sports-reference
     3332 podcast
     2557 arxiv
     1859 techreport
     1455 mailinglist
     1284 speech
      860 newsgroup
      837 sign
      657 serial
      567 DVD-notes
      215 policy

# identifier types present in refs (number of refs carrying each ID_list key), most frequent first
zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r | sort | uniq -c | sort -nr
 2825707 ISBN
 1859682 DOI
  980996 PMID
  520281 ISSN
  342543 PMC
  294828 BIBCODE
  261269 OCLC
  157287 JSTOR
   82960 ARXIV
   23376 MR
   20723 LCCN
    9518 ASIN
    7520 OL
    4845 ZBL
    4447 SSRN
     954 OSTI
     666 JFM
     425 USENETID
     151 ISMN
      95 RFC

# refs with a non-null URL, by type (CitationClass), most frequent first
zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr
 21169067 web
  5743802 news
  1958773 book
  1165475 journal
   476587 citation
    98280 pressrelease
    62235 report
    60868 encyclopedia
    27680 map
    18742 AV-media
    15317 thesis
    11236 conference
    10666 episode
     6609 interview
     5100 sports-reference
     3328 podcast
     2431 AV-media-notes
     1446 mailinglist
     1361 techreport
     1039 speech
      821 newsgroup
      504 sign
      138 serial
       24 harvnb
       21 DVD-notes
       15 gazette
        2 gnis