author     Bryan Newbold <bnewbold@archive.org>    2021-12-15 15:54:55 -0800
committer  Bryan Newbold <bnewbold@archive.org>    2021-12-15 15:54:55 -0800
commit     1d6589ed58879206c4507d08b25ab09e859d34ee (patch)
tree       216504a1c80c6f68a0a2881d10252267f6cab730
parent     5df39489600b8910d356cde9aece09381edf547f (diff)
wikipedia refs prep notes, and stats from 20210801 run
-rw-r--r--   extra/wikipedia/README.md                   60
-rw-r--r--   extra/wikipedia/stats_enwiki_20210801.txt  104
-rw-r--r--   extra/wikipedia/stats_template.txt          23
3 files changed, 187 insertions, 0 deletions
diff --git a/extra/wikipedia/README.md b/extra/wikipedia/README.md
new file mode 100644
index 0000000..8cfdfc0

This document describes how to parse references out of Wikipedia bulk XML
dumps, using the `wikiciteparser` Python package, for use in the refcat
citation matching pipeline.

Unfortunately, due to limitations in `wikiciteparser` (and the complexity of
Wikipedia citation formatting across language instances), this pipeline only
works with the English version of Wikipedia (enwiki).


## Download Bulk XML Snapshot

You can find documentation and links to recent snapshots at
<https://dumps.wikimedia.org/backup-index.html>. We want the
`-pages-articles.xml.bz2` files, which include article text for the most
recent revision of each article. If we download the set of smaller individual
files instead of the single combined file, we can parallelize processing later.

A hacky way to download all the files is to copy/paste the list of URLs from
the web listing into a file called `urls.txt`, then run a command like:

    cat urls.txt | parallel -j2 wget --quiet -c https://dumps.wikimedia.org/enwiki/20211201/{}


## Install `wikiciteparser`

To use the official released version, in a virtualenv (or similar), run:

    pip install wikiciteparser

Or, do a git checkout of <https://github.com/dissemin/wikiciteparser>.


## Run Parallel Command

Within the virtualenv, use `parallel` to process the dump files:

    ls /fast/download/enwiki-20211201-pages-articles/enwiki*.bz2 \
        | parallel -j12 --line-buffer python3 -m wikiciteparser.bulk {} \
        | pv -l \
        | gzip \
        > enwiki-20211201-pages-articles.citations.json.gz

This will output JSON lines, one line per article, with the article title,
revision, site name, and any extracted references in a sub-array (of JSON
objects).
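To get a feel for the output structure before running the full stats queries
below, one can peek at the first article's record. This is only an illustrative
sketch, not part of the pipeline; the `refs` key name is taken from the stats
commands in this commit, and other key names should be checked against real
output:

    # top-level keys of the first article record
    zcat enwiki-20211201-pages-articles.citations.json.gz | head -n1 | jq 'keys'

    # first extracted reference from the first article, if any
    zcat enwiki-20211201-pages-articles.citations.json.gz | head -n1 | jq '.refs[0]'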
## Prior Work

Similar projects include:

* [Harshdeep1996/cite-classifications-wiki](https://github.com/Harshdeep1996/cite-classifications-wiki):
  uses `wikiciteparser` and PySpark to extract references from bulk XML,
  outputs parquet. Requires a Spark cluster/environment to run (itself used by
  [Wikipedia Citations in Wikidata](https://github.com/opencitations/wcw))
* [python-mwcites](https://github.com/mediawiki-utilities/python-mwcites): uses
  `python-mwxml` to iterate over bulk XML, has relatively simple identifier
  extraction
* [gesiscss/wikipedia_references](https://github.com/gesiscss/wikipedia_references):
  oriented towards tracking edits to individual references over time

diff --git a/extra/wikipedia/stats_enwiki_20210801.txt b/extra/wikipedia/stats_enwiki_20210801.txt
new file mode 100644
index 0000000..9acfc83

export CITEFILE=enwiki-20210801-pages-articles.citations.json.gz

# total number of articles processed
zcat $CITEFILE | wc -l
=> 6,348,910

# articles with one or more refs
zcat $CITEFILE | rg '"CitationClass"' | wc -l
=> 4,255,940

# total number of refs
zcat $CITEFILE | jq '.refs[].CitationClass' -r | wc -l
=> 36,057,931

# refs by type
zcat $CITEFILE | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr
   21257548 web
    6162933 news
    3984831 book
    2620278 journal
     756082 citation
     379975 harvnb
     105212 gazette
      99427 pressrelease
      84036 nhle
      78761 encyclopedia
      71774 gnis
      70308 nrisref
      67731 report
      48090 episode
      43060 geonet3
      41776 map
      40904 AV-media-notes
      40140 season
      28051 AV-media
      22182 thesis
      17891 conference
      10420 england
       7798 interview
       5100 sports-reference
       3332 podcast
       2557 arxiv
       1859 techreport
       1455 mailinglist
       1284 speech
        860 newsgroup
        837 sign
        657 serial
        567 DVD-notes
        215 policy

# identifiers present
zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r | sort | uniq -c | sort -nr
    2825707 ISBN
    1859682 DOI
     980996 PMID
     520281 ISSN
     342543 PMC
     294828 BIBCODE
     261269 OCLC
     157287 JSTOR
      82960 ARXIV
      23376 MR
      20723 LCCN
       9518 ASIN
       7520 OL
       4845 ZBL
       4447 SSRN
        954 OSTI
        666 JFM
        425 USENETID
        151 ISMN
         95 RFC

# refs with URL, by type
zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr
   21169067 web
    5743802 news
    1958773 book
    1165475 journal
     476587 citation
      98280 pressrelease
      62235 report
      60868 encyclopedia
      27680 map
      18742 AV-media
      15317 thesis
      11236 conference
      10666 episode
       6609 interview
       5100 sports-reference
       3328 podcast
       2431 AV-media-notes
       1446 mailinglist
       1361 techreport
       1039 speech
        821 newsgroup
        504 sign
        138 serial
         24 harvnb
         21 DVD-notes
         15 gazette
          2 gnis
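Building on the identifier counts above, refs that carry a DOI could be
flattened into a simple TSV (DOI plus citation type) for downstream matching.
This is a hedged editorial sketch, not one of the committed queries; it only
reuses the `refs`, `ID_list`, and `CitationClass` fields seen above:

    zcat $CITEFILE \
        | jq -r '.refs[] | select(.ID_list.DOI != null) | [.ID_list.DOI, .CitationClass] | @tsv' \
        | head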
diff --git a/extra/wikipedia/stats_template.txt b/extra/wikipedia/stats_template.txt
new file mode 100644
index 0000000..fda994f

export CITEFILE=enwiki-YYYYMMDD-pages-articles.citations.json.gz

# total number of articles processed
zcat $CITEFILE | wc -l

# articles with one or more refs
zcat $CITEFILE | rg '"CitationClass"' | wc -l

# total number of refs
zcat $CITEFILE | jq '.refs[].CitationClass' -r | wc -l

# refs by type
zcat $CITEFILE | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr

# identifiers present
zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r | sort | uniq -c | sort -nr

# refs with URL, by type
zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr

# refs with ArchiveURL, by type
zcat $CITEFILE | jq '.refs[] | select(.ArchiveURL != null) | .CitationClass' -r | sort | uniq -c | sort -nr
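The template above is meant to be copy/pasted for each new snapshot. If that
gets tedious, a small wrapper along the following lines could run the same
queries against any citations file. This is a hypothetical convenience, not
part of this commit; the `run_stats.sh` name is made up:

    #!/usr/bin/env bash
    # run_stats.sh: run the stats_template.txt queries against one citations file.
    # Usage: ./run_stats.sh enwiki-YYYYMMDD-pages-articles.citations.json.gz
    set -euo pipefail
    CITEFILE=$1

    echo "# total number of articles processed"
    zcat "$CITEFILE" | wc -l

    echo "# articles with one or more refs"
    zcat "$CITEFILE" | rg '"CitationClass"' | wc -l

    echo "# refs by type"
    zcat "$CITEFILE" | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr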