From 9a66d7c4896f3415816d2df97bba9a01ac0ebf0c Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 14 Apr 2021 00:24:46 +0200 Subject: update notes --- python/notes/version_3.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'python') diff --git a/python/notes/version_3.md b/python/notes/version_3.md index 71d4dd1..891b61c 100644 --- a/python/notes/version_3.md +++ b/python/notes/version_3.md @@ -208,3 +208,25 @@ $ time zstdcat -T0 /magna/refcat/UnmatchedRefs/date-2021-02-20.json.zst | LC_ALL A first run only got 64008 docs; improbable that we are missing so many DOIs. Also, need to generalize some skate code a bit. + +---- + +# Verification stats + +* have 40257623 clusters, `zstdcat -T0 /magna/refcat/RefsFatcatClusters/date-2021-02-20.json.zst | wc -l` +* have X clusters of size less than 10 + +``` +$ zstdcat -T0 /magna/refcat/RefsFatcatClusters/date-2021-02-20.json.zst | + jq -rc 'select(.v|length < 10)' | LC_ALL=C wc -l +``` + +A 5M sample. + +``` +$ awk '{print $3}' cluster_verify_5m.txt | sort | uniq -c | sort -nr +6886124 StatusDifferent +4619805 StatusStrong +3587478 StatusExact + 120215 StatusAmbiguous +``` -- cgit v1.2.3