about summary refs log tree commit diff stats
path: root/extra/wikipedia/stats_template.txt
diff options
context:
space:
mode:
Diffstat (limited to 'extra/wikipedia/stats_template.txt')
-rw-r--r-- extra/wikipedia/stats_template.txt 23
1 files changed, 23 insertions, 0 deletions
diff --git a/extra/wikipedia/stats_template.txt b/extra/wikipedia/stats_template.txt
new file mode 100644
index 0000000..fda994f
--- /dev/null
+++ b/extra/wikipedia/stats_template.txt
@@ -0,0 +1,23 @@
+
+export CITEFILE=enwiki-YYYYMMDD-pages-articles.citations.json.gz
+
+# total number of articles processed
+zcat $CITEFILE | wc -l
+
+# articles with one or more refs
+zcat $CITEFILE | rg '"CitationClass"' | wc -l
+
+# total number of refs
+zcat $CITEFILE | jq '.refs[].CitationClass' -r | wc -l
+
+# refs by type
+zcat $CITEFILE | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr
+
+# identifiers present
+zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r | sort | uniq -c | sort -nr
+
+# refs with URL, by type
+zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr
+
+# refs with ArchiveURL, by type
+zcat $CITEFILE | jq '.refs[] | select(.ArchiveURL != null) | .CitationClass' -r | sort | uniq -c | sort -nr