From 9d707a0203ac3aaf17e266a0f5a934b5f9e2dbbf Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 9 Dec 2020 22:59:47 +0100 Subject: update README --- README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6f90c80..1c41f4e 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ Single threaded, 42h. ``` $ time zstdcat -T0 release_export_expanded.json.zst | \ - TMPDIR=/bigger/tmp python -m fuzzycat cluster --tmpdir /bigger/tmp -t tsandcrawler \ | + TMPDIR=/bigger/tmp python -m fuzzycat cluster --tmpdir /bigger/tmp -t tsandcrawler | \ zstd -c9 > cluster_tsandcrawler.json.zst { "key_fail": 0, @@ -82,6 +82,19 @@ sys 118m38.141s So, 29881072 (about 20%) docs in the potentially duplicated set. +Verification (about 15h): + +``` +$ time zstdcat -T0 cluster_tsandcrawler.json.zst | python -m fuzzycat verify | \ + zstd -c9 > cluster_tsandcrawler_verified_3c7378.tsv.zst + +... + +real 927m28.631s +user 939m32.761s +sys 36m47.602s +``` + # Use cases -- cgit v1.2.3