diff options
Diffstat (limited to 'extra/wikipedia/stats_template.txt')
-rw-r--r-- | extra/wikipedia/stats_template.txt | 23 |
1 files changed, 23 insertions, 0 deletions
diff --git a/extra/wikipedia/stats_template.txt b/extra/wikipedia/stats_template.txt new file mode 100644 index 0000000..fda994f --- /dev/null +++ b/extra/wikipedia/stats_template.txt @@ -0,0 +1,23 @@ + +export CITEFILE=enwiki-YYYYMMDD-pages-articles.citations.json.gz + +# total number of articles processed +zcat $CITEFILE | wc -l + +# articles with one or more refs +zcat $CITEFILE | rg '"CitationClass"' | wc -l + +# total number of refs +zcat $CITEFILE | jq '.refs[].CitationClass' -r | wc -l + +# refs by type +zcat $CITEFILE | jq '.refs[].CitationClass' -r | sort | uniq -c | sort -nr + +# identifiers present +zcat $CITEFILE | jq '.refs[] | select(.ID_list != null) | .ID_list | keys[]' -r | sort | uniq -c | sort -nr + +# refs with URL, by type +zcat $CITEFILE | jq '.refs[] | select(.URL != null) | .CitationClass' -r | sort | uniq -c | sort -nr + +# refs with ArchiveURL, by type +zcat $CITEFILE | jq '.refs[] | select(.ArchiveURL != null) | .CitationClass' -r | sort | uniq -c | sort -nr |