diff options
author | Martin Czygan <martin@archive.org> | 2021-12-18 00:18:46 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2021-12-18 00:18:46 +0000 |
commit | 3c0cae2b81dbd4ff7621cf9b7e4a6183352984f0 (patch) | |
tree | d6289ab22ead66eb28190ff07e00af6ab0f35306 /extra/wikipedia/stats_enwiki_20210801.txt | |
parent | 3867fcab91244650a1e2fd9bba165a54c4e810e5 (diff) | |
parent | fa557a90482cfed59564173e442d9375b959ee8b (diff) | |
download | refcat-3c0cae2b81dbd4ff7621cf9b7e4a6183352984f0.tar.gz refcat-3c0cae2b81dbd4ff7621cf9b7e4a6183352984f0.zip |
Merge branch 'bnewbold-wikipedia-notes' into 'master'
wikipedia refs prep notes, and stats from 20210801 run
See merge request webgroup/refcat!5
Diffstat (limited to 'extra/wikipedia/stats_enwiki_20210801.txt')
-rw-r--r-- | extra/wikipedia/stats_enwiki_20210801.txt | 104 |
1 files changed, 104 insertions, 0 deletions
```diff
diff --git a/extra/wikipedia/stats_enwiki_20210801.txt b/extra/wikipedia/stats_enwiki_20210801.txt
new file mode 100644
index 0000000..9acfc83
--- /dev/null
+++ b/extra/wikipedia/stats_enwiki_20210801.txt
@@ -0,0 +1,104 @@
+
+export CITEFILE=enwiki-20210801-pages-articles.citations.json.gz
+
+# total number of articles processed
+zcat $CITEFILE | wc -l
+=> 6,348,910
+
+# articles with one or more refs
+zcat $CITEFILE | rg '"CitationClass"' | wc -l
+=> 4,255,940
+
+# total number of refs
+zcat $CITEFILE | jq '.refs[].CitationClass' -r | wc -l
+=> 36,057,931
+
+# refs by type
+zcat $CITEFILE | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr
+ 21257548 web
+  6162933 news
+  3984831 book
+  2620278 journal
+   756082 citation
+   379975 harvnb
+   105212 gazette
+    99427 pressrelease
+    84036 nhle
+    78761 encyclopedia
+    71774 gnis
+    70308 nrisref
+    67731 report
+    48090 episode
+    43060 geonet3
+    41776 map
+    40904 AV-media-notes
+    40140 season
+    28051 AV-media
+    22182 thesis
+    17891 conference
+    10420 england
+     7798 interview
+     5100 sports-reference
+     3332 podcast
+     2557 arxiv
+     1859 techreport
+     1455 mailinglist
+     1284 speech
+      860 newsgroup
+      837 sign
+      657 serial
+      567 DVD-notes
+      215 policy
+
+# identifiers present
+zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r | sort | uniq -c | sort -nr
+  2825707 ISBN
+  1859682 DOI
+   980996 PMID
+   520281 ISSN
+   342543 PMC
+   294828 BIBCODE
+   261269 OCLC
+   157287 JSTOR
+    82960 ARXIV
+    23376 MR
+    20723 LCCN
+     9518 ASIN
+     7520 OL
+     4845 ZBL
+     4447 SSRN
+      954 OSTI
+      666 JFM
+      425 USENETID
+      151 ISMN
+       95 RFC
+
+# refs with URL, by type
+zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr
+ 21169067 web
+  5743802 news
+  1958773 book
+  1165475 journal
+   476587 citation
+    98280 pressrelease
+    62235 report
+    60868 encyclopedia
+    27680 map
+    18742 AV-media
+    15317 thesis
+    11236 conference
+    10666 episode
+     6609 interview
+     5100 sports-reference
+     3328 podcast
+     2431 AV-media-notes
+     1446 mailinglist
+     1361 techreport
+     1039 speech
+      821 newsgroup
+      504 sign
+      138 serial
+       24 harvnb
+       21 DVD-notes
+       15 gazette
+        2 gnis
```