diff options
-rw-r--r-- | extra/wikipedia/README.md | 3
-rw-r--r-- | extra/wikipedia/stats_enwiki_20211201.txt | 168
-rw-r--r-- | extra/wikipedia/stats_template.txt | 2
3 files changed, 171 insertions, 2 deletions
diff --git a/extra/wikipedia/README.md b/extra/wikipedia/README.md index 8cfdfc0..59480a7 100644 --- a/extra/wikipedia/README.md +++ b/extra/wikipedia/README.md @@ -43,7 +43,8 @@ Within a virtualenv, use `parallel` to process like: This will output JSON lines, one line per article, with the article title, revision, site name, and any extracted references in a sub-array (of JSON -objects). +objects). As of December 2021, it takes about 17 hours on a large machine, with +the above command. ## Prior Work diff --git a/extra/wikipedia/stats_enwiki_20211201.txt b/extra/wikipedia/stats_enwiki_20211201.txt new file mode 100644 index 0000000..0553979 --- /dev/null +++ b/extra/wikipedia/stats_enwiki_20211201.txt @@ -0,0 +1,168 @@ + +export CITEFILE=enwiki-20211201-pages-articles.citations.json.gz + +# total number of articles processed +zcat $CITEFILE | wc -l +=> 6,416,542 + +# articles with one or more refs +zcat $CITEFILE | rg '"CitationClass"' | wc -l +=> 4,389,394 + +# total number of refs +zcat $CITEFILE | jq '.refs[].CitationClass' -r | wc -l +=> 37,724,219 + +# refs by type +zcat $CITEFILE | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr + 22379651 web + 6430930 news + 4124318 book + 2699528 journal + 770319 citation + 388893 harvnb + 106646 gazette + 102752 pressrelease + 86908 nhle + 82366 encyclopedia + 72401 gnis + 71358 report + 70419 nrisref + 49233 episode + 43692 AV-media-notes + 43058 geonet3 + 42300 map + 41957 season + 36349 AV-media + 24498 thesis + 18723 conference + 10591 england + 8283 interview + 5084 sports-reference + 3567 podcast + 2528 arxiv + 1915 techreport + 1466 mailinglist + 1325 speech + 869 newsgroup + 864 sign + 647 serial + 566 DVD-notes + 215 policy + +# identifiers present +zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r | sort | uniq -c | sort -nr + 2925108 ISBN + 1932505 DOI + 1010556 PMID + 557313 ISSN + 356460 PMC + 320414 BIBCODE + 275283 OCLC + 166861 JSTOR + 85286 ARXIV + 23600 MR + 21797 
LCCN + 9927 ASIN + 7814 OL + 4878 ZBL + 4556 SSRN + 1071 OSTI + 672 JFM + 424 USENETID + 152 ISMN + 86 RFC + +# refs with URL, by type +zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr + 22291836 web + 5995008 news + 2033769 book + 1217383 journal + 487205 citation + 101594 pressrelease + 65505 report + 63903 encyclopedia + 28179 map + 26538 AV-media + 17167 thesis + 11806 conference + 10924 episode + 7042 interview + 5084 sports-reference + 3562 podcast + 2484 AV-media-notes + 1458 mailinglist + 1416 techreport + 1072 speech + 831 newsgroup + 515 sign + 139 serial + 24 harvnb + 22 DVD-notes + 15 gazette + +# refs with ArchiveURL, by type +zcat $CITEFILE | jq '.refs[] | select(.ArchiveURL != null) | .CitationClass' -r | sort | uniq -c | sort -nr + 4738008 web + 1089407 news + 111550 journal + 79472 book + 76758 citation + 33111 pressrelease + 12318 report + 8019 encyclopedia + 6061 AV-media + 5068 map + 4887 sports-reference + 2716 conference + 2138 interview + 1915 episode + 1716 thesis + 578 podcast + 400 AV-media-notes + 308 speech + 257 mailinglist + 244 techreport + 149 newsgroup + 32 sign + 10 serial + 7 DVD-notes + 4 nhle + 3 england + 2 gazette + 1 gnis + +# top ref URL hosts (not domains, but leading 'www' removed) +zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .URL' -r | cut -f3 -d/ | sed 's/^www\.//g' | sort | uniq -c | sort -nr | head -n30 + 1266572 books.google.com + 552597 archive.org + 372440 nytimes.com + 361046 ncbi.nlm.nih.gov + 275580 bbc.co.uk + 261514 newspapers.com + 245562 theguardian.com + 214278 billboard.com + 197563 youtube.com + 190427 news.bbc.co.uk + 160735 census.gov + 143304 news.google.com + 125024 allmusic.com + 110937 nla.gov.au + 105348 washingtonpost.com + 93509 telegraph.co.uk + 85782 bbc.com + 82966 espncricinfo.com + 82789 timesofindia.indiatimes.com + 81518 variety.com + 77786 imdb.com + 76921 independent.co.uk + 73717 baseball-reference.com + 72244 deadline.com + 
67088 animenewsnetwork.com + 64784 sports-reference.com + 63994 reuters.com + 63394 hollywoodreporter.com + 60720 thehindu.com + 58972 tvbythenumbers.zap2it.com + [...] diff --git a/extra/wikipedia/stats_template.txt b/extra/wikipedia/stats_template.txt index fda994f..07f9745 100644 --- a/extra/wikipedia/stats_template.txt +++ b/extra/wikipedia/stats_template.txt @@ -19,5 +19,5 @@ zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r # refs with URL, by type zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr -# refs with URL, by type +# refs with ArchiveURL, by type zcat $CITEFILE | jq '.refs[] | select(.ArchiveURL != null) | .CitationClass' -r | sort | uniq -c | sort -nr |