aboutsummaryrefslogtreecommitdiffstats
path: root/extra/wikipedia/stats_enwiki_20210801.txt
diff options
context:
space:
mode:
author Martin Czygan <martin@archive.org> 2021-12-18 00:18:46 +0000
committer Martin Czygan <martin@archive.org> 2021-12-18 00:18:46 +0000
commit 3c0cae2b81dbd4ff7621cf9b7e4a6183352984f0 (patch)
tree d6289ab22ead66eb28190ff07e00af6ab0f35306 /extra/wikipedia/stats_enwiki_20210801.txt
parent 3867fcab91244650a1e2fd9bba165a54c4e810e5 (diff)
parent fa557a90482cfed59564173e442d9375b959ee8b (diff)
download refcat-3c0cae2b81dbd4ff7621cf9b7e4a6183352984f0.tar.gz
refcat-3c0cae2b81dbd4ff7621cf9b7e4a6183352984f0.zip
Merge branch 'bnewbold-wikipedia-notes' into 'master'
wikipedia refs prep notes, and stats from 20210801 run. See merge request webgroup/refcat!5
Diffstat (limited to 'extra/wikipedia/stats_enwiki_20210801.txt')
-rw-r--r-- extra/wikipedia/stats_enwiki_20210801.txt 104
1 files changed, 104 insertions, 0 deletions
diff --git a/extra/wikipedia/stats_enwiki_20210801.txt b/extra/wikipedia/stats_enwiki_20210801.txt
new file mode 100644
index 0000000..9acfc83
--- /dev/null
+++ b/extra/wikipedia/stats_enwiki_20210801.txt
@@ -0,0 +1,104 @@
+
+export CITEFILE=enwiki-20210801-pages-articles.citations.json.gz
+
+# total number of articles processed
+zcat $CITEFILE | wc -l
+=> 6,348,910
+
+# articles with one or more refs
+zcat $CITEFILE | rg '"CitationClass"' | wc -l
+=> 4,255,940
+
+# total number of refs
+zcat $CITEFILE | jq '.refs[].CitationClass' -r | wc -l
+=> 36,057,931
+
+# refs by type
+zcat $CITEFILE | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr
+ 21257548 web
+ 6162933 news
+ 3984831 book
+ 2620278 journal
+ 756082 citation
+ 379975 harvnb
+ 105212 gazette
+ 99427 pressrelease
+ 84036 nhle
+ 78761 encyclopedia
+ 71774 gnis
+ 70308 nrisref
+ 67731 report
+ 48090 episode
+ 43060 geonet3
+ 41776 map
+ 40904 AV-media-notes
+ 40140 season
+ 28051 AV-media
+ 22182 thesis
+ 17891 conference
+ 10420 england
+ 7798 interview
+ 5100 sports-reference
+ 3332 podcast
+ 2557 arxiv
+ 1859 techreport
+ 1455 mailinglist
+ 1284 speech
+ 860 newsgroup
+ 837 sign
+ 657 serial
+ 567 DVD-notes
+ 215 policy
+
+# identifiers present
+zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r | sort | uniq -c | sort -nr
+ 2825707 ISBN
+ 1859682 DOI
+ 980996 PMID
+ 520281 ISSN
+ 342543 PMC
+ 294828 BIBCODE
+ 261269 OCLC
+ 157287 JSTOR
+ 82960 ARXIV
+ 23376 MR
+ 20723 LCCN
+ 9518 ASIN
+ 7520 OL
+ 4845 ZBL
+ 4447 SSRN
+ 954 OSTI
+ 666 JFM
+ 425 USENETID
+ 151 ISMN
+ 95 RFC
+
+# refs with URL, by type
+zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr
+ 21169067 web
+ 5743802 news
+ 1958773 book
+ 1165475 journal
+ 476587 citation
+ 98280 pressrelease
+ 62235 report
+ 60868 encyclopedia
+ 27680 map
+ 18742 AV-media
+ 15317 thesis
+ 11236 conference
+ 10666 episode
+ 6609 interview
+ 5100 sports-reference
+ 3328 podcast
+ 2431 AV-media-notes
+ 1446 mailinglist
+ 1361 techreport
+ 1039 speech
+ 821 newsgroup
+ 504 sign
+ 138 serial
+ 24 harvnb
+ 21 DVD-notes
+ 15 gazette
+ 2 gnis