diff options
-rw-r--r-- | extra/wikipedia/README.md | 61 | ||||
-rw-r--r-- | extra/wikipedia/stats_enwiki_20210801.txt | 104 | ||||
-rw-r--r-- | extra/wikipedia/stats_enwiki_20211201.txt | 168 | ||||
-rw-r--r-- | extra/wikipedia/stats_template.txt | 23 |
4 files changed, 356 insertions, 0 deletions
diff --git a/extra/wikipedia/README.md b/extra/wikipedia/README.md new file mode 100644 index 0000000..59480a7 --- /dev/null +++ b/extra/wikipedia/README.md @@ -0,0 +1,61 @@ + +This document describes how to parse references out of Wikipedia bulk XML +dumps, using the `wikiciteparser` Python package, for use in the refcat +citation matching pipeline. + +Unfortunately, due to limitations in `wikiciteparser` (and the complexity of +Wikipedia citation formatting across language instances), this pipeline only +works with the English version of Wikipedia (enwiki). + + +## Download Bulk XML Snapshot + +You can find documentation and links to recent snapshots at +<https://dumps.wikimedia.org/backup-index.html>. We want the +`-pages-articles.xml.bz2` files, which include article text for the most +recent version of articles. If we download the set of smaller individual files, +instead of the single combined file, we can parallelize processing later. + +A hacky way to download all the files is to copy/paste the list of URLs from +the web listing, put them in a file called `urls.txt`, then run a command like: + + cat urls.txt | parallel -j2 wget --quiet -c https://dumps.wikimedia.org/enwiki/20211201/{} + + +## Install `wikiciteparser` + +To use the official/released version, in a virtualenv (or similar), run: + + pip install wikiciteparser + +Or, do a git checkout of <https://github.com/dissemin/wikiciteparser>. + + +## Run Parallel Command + +Within a virtualenv, use `parallel` to process like: + + ls /fast/download/enwiki-20211201-pages-articles/enwiki*.bz2 \ + | parallel -j12 --line-buffer python3 -m wikiciteparser.bulk {} \ + | pv -l \ + | gzip \ + > enwiki-20211201-pages-articles.citations.json.gz + +This will output JSON lines, one line per article, with the article title, +revision, site name, and any extracted references in a sub-array (of JSON +objects). As of December 2021, it takes about 17 hours on a large machine, with +the above command. 
+ +## Prior Work + +Similar projects include: + +* [Harshdeep1996/cite-classifications-wiki](https://github.com/Harshdeep1996/cite-classifications-wiki): + uses `wikiciteparser` and PySpark to extract references from bulk XML, + outputs parquet. Requires a Spark cluster/environment to run. (itself used by + [Wikipedia Citations in Wikidata](https://github.com/opencitations/wcw)) +* [python-mwcites](https://github.com/mediawiki-utilities/python-mwcites): uses + `python-mwxml` to iterate over bulk XML, has relatively simple identifier + extraction +* [gesiscss/wikipedia_references](https://github.com/gesiscss/wikipedia_references): + oriented towards tracking edits to individual references over time diff --git a/extra/wikipedia/stats_enwiki_20210801.txt b/extra/wikipedia/stats_enwiki_20210801.txt new file mode 100644 index 0000000..9acfc83 --- /dev/null +++ b/extra/wikipedia/stats_enwiki_20210801.txt @@ -0,0 +1,104 @@ + +export CITEFILE=enwiki-20210801-pages-articles.citations.json.gz + +# total number of articles processed +zcat $CITEFILE | wc -l +=> 6,348,910 + +# articles with one or more refs +zcat $CITEFILE | rg '"CitationClass"' | wc -l +=> 4,255,940 + +# total number of refs +zcat $CITEFILE | jq '.refs[].CitationClass' -r | wc -l +=> 36,057,931 + +# refs by type +zcat $CITEFILE | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr + 21257548 web + 6162933 news + 3984831 book + 2620278 journal + 756082 citation + 379975 harvnb + 105212 gazette + 99427 pressrelease + 84036 nhle + 78761 encyclopedia + 71774 gnis + 70308 nrisref + 67731 report + 48090 episode + 43060 geonet3 + 41776 map + 40904 AV-media-notes + 40140 season + 28051 AV-media + 22182 thesis + 17891 conference + 10420 england + 7798 interview + 5100 sports-reference + 3332 podcast + 2557 arxiv + 1859 techreport + 1455 mailinglist + 1284 speech + 860 newsgroup + 837 sign + 657 serial + 567 DVD-notes + 215 policy + +# identifiers present +zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | 
.ID_list | keys[]' -r | sort | uniq -c | sort -nr + 2825707 ISBN + 1859682 DOI + 980996 PMID + 520281 ISSN + 342543 PMC + 294828 BIBCODE + 261269 OCLC + 157287 JSTOR + 82960 ARXIV + 23376 MR + 20723 LCCN + 9518 ASIN + 7520 OL + 4845 ZBL + 4447 SSRN + 954 OSTI + 666 JFM + 425 USENETID + 151 ISMN + 95 RFC + +# refs with URL, by type +zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr + 21169067 web + 5743802 news + 1958773 book + 1165475 journal + 476587 citation + 98280 pressrelease + 62235 report + 60868 encyclopedia + 27680 map + 18742 AV-media + 15317 thesis + 11236 conference + 10666 episode + 6609 interview + 5100 sports-reference + 3328 podcast + 2431 AV-media-notes + 1446 mailinglist + 1361 techreport + 1039 speech + 821 newsgroup + 504 sign + 138 serial + 24 harvnb + 21 DVD-notes + 15 gazette + 2 gnis diff --git a/extra/wikipedia/stats_enwiki_20211201.txt b/extra/wikipedia/stats_enwiki_20211201.txt new file mode 100644 index 0000000..0553979 --- /dev/null +++ b/extra/wikipedia/stats_enwiki_20211201.txt @@ -0,0 +1,168 @@ + +export CITEFILE=enwiki-20211201-pages-articles.citations.json.gz + +# total number of articles processed +zcat $CITEFILE | wc -l +=> 6,416,542 + +# articles with one or more refs +zcat $CITEFILE | rg '"CitationClass"' | wc -l +=> 4,389,394 + +# total number of refs +zcat $CITEFILE | jq '.refs[].CitationClass' -r | wc -l +=> 37,724,219 + +# refs by type +zcat $CITEFILE | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr + 22379651 web + 6430930 news + 4124318 book + 2699528 journal + 770319 citation + 388893 harvnb + 106646 gazette + 102752 pressrelease + 86908 nhle + 82366 encyclopedia + 72401 gnis + 71358 report + 70419 nrisref + 49233 episode + 43692 AV-media-notes + 43058 geonet3 + 42300 map + 41957 season + 36349 AV-media + 24498 thesis + 18723 conference + 10591 england + 8283 interview + 5084 sports-reference + 3567 podcast + 2528 arxiv + 1915 techreport + 1466 mailinglist + 
1325 speech + 869 newsgroup + 864 sign + 647 serial + 566 DVD-notes + 215 policy + +# identifiers present +zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r | sort | uniq -c | sort -nr + 2925108 ISBN + 1932505 DOI + 1010556 PMID + 557313 ISSN + 356460 PMC + 320414 BIBCODE + 275283 OCLC + 166861 JSTOR + 85286 ARXIV + 23600 MR + 21797 LCCN + 9927 ASIN + 7814 OL + 4878 ZBL + 4556 SSRN + 1071 OSTI + 672 JFM + 424 USENETID + 152 ISMN + 86 RFC + +# refs with URL, by type +zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr + 22291836 web + 5995008 news + 2033769 book + 1217383 journal + 487205 citation + 101594 pressrelease + 65505 report + 63903 encyclopedia + 28179 map + 26538 AV-media + 17167 thesis + 11806 conference + 10924 episode + 7042 interview + 5084 sports-reference + 3562 podcast + 2484 AV-media-notes + 1458 mailinglist + 1416 techreport + 1072 speech + 831 newsgroup + 515 sign + 139 serial + 24 harvnb + 22 DVD-notes + 15 gazette + +# refs with ArchiveURL, by type +zcat $CITEFILE | jq '.refs[] | select(.ArchiveURL != null) | .CitationClass' -r | sort | uniq -c | sort -nr + 4738008 web + 1089407 news + 111550 journal + 79472 book + 76758 citation + 33111 pressrelease + 12318 report + 8019 encyclopedia + 6061 AV-media + 5068 map + 4887 sports-reference + 2716 conference + 2138 interview + 1915 episode + 1716 thesis + 578 podcast + 400 AV-media-notes + 308 speech + 257 mailinglist + 244 techreport + 149 newsgroup + 32 sign + 10 serial + 7 DVD-notes + 4 nhle + 3 england + 2 gazette + 1 gnis + +# top ref URL hosts (not domains, but leading 'www' removed) +zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .URL' -r | cut -f3 -d/ | sed 's/^www\.//g' | sort | uniq -c | sort -nr | head -n30 + 1266572 books.google.com + 552597 archive.org + 372440 nytimes.com + 361046 ncbi.nlm.nih.gov + 275580 bbc.co.uk + 261514 newspapers.com + 245562 theguardian.com + 214278 billboard.com + 197563 
youtube.com + 190427 news.bbc.co.uk + 160735 census.gov + 143304 news.google.com + 125024 allmusic.com + 110937 nla.gov.au + 105348 washingtonpost.com + 93509 telegraph.co.uk + 85782 bbc.com + 82966 espncricinfo.com + 82789 timesofindia.indiatimes.com + 81518 variety.com + 77786 imdb.com + 76921 independent.co.uk + 73717 baseball-reference.com + 72244 deadline.com + 67088 animenewsnetwork.com + 64784 sports-reference.com + 63994 reuters.com + 63394 hollywoodreporter.com + 60720 thehindu.com + 58972 tvbythenumbers.zap2it.com + [...] diff --git a/extra/wikipedia/stats_template.txt b/extra/wikipedia/stats_template.txt new file mode 100644 index 0000000..07f9745 --- /dev/null +++ b/extra/wikipedia/stats_template.txt @@ -0,0 +1,23 @@ + +export CITEFILE=enwiki-YYYYMMDD-pages-articles.citations.json.gz + +# total number of articles processed +zcat $CITEFILE | wc -l + +# articles with one or more refs +zcat $CITEFILE | rg '"CitationClass"' | wc -l + +# total number of refs +zcat $CITEFILE | jq '.refs[].CitationClass' -r | wc -l + +# refs by type +zcat $CITEFILE | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr + +# identifiers present +zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r | sort | uniq -c | sort -nr + +# refs with URL, by type +zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr + +# refs with ArchiveURL, by type +zcat $CITEFILE | jq '.refs[] | select(.ArchiveURL != null) | .CitationClass' -r | sort | uniq -c | sort -nr |