update README

author: Martin Czygan <martin.czygan@gmail.com> 2020-12-23 00:51:22 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-12-23 00:51:22 +0100
commit: a64d1a6822838e5e30d9be602847fd76d0fe7c15 (patch)
tree: e929d0c8db64c9569d8509cf2e5f18e35437fae8
parent: 85189c2f87e6c198069ec250818df029c58c4a3d (diff)
download: fuzzycat-a64d1a6822838e5e30d9be602847fd76d0fe7c15.tar.gz fuzzycat-a64d1a6822838e5e30d9be602847fd76d0fe7c15.zip
1 file changed, 5 insertions, 29 deletions
diff --git a/README.md b/README.md
index 8547d45..a7a6df1 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ Clustering works in a three step process:
 
 ### Verification
 
-Run verification.
+Run verification (pairwise *double-check* of match candidates in a cluster).
 
 ```
 $ time zstdcat -T0 sample_cluster.json.zst | python -m fuzzycat verify > sample_verify.txt
@@ -49,37 +49,13 @@ user    8m50.703s
 sys     0m29.262s
 ```
 
+This is a one-pass operation. To process 150M docs, we depend heavily on the
+documents being available on disk in a file (each clustering doc keeps the
+complete document).
 
 Example results over 10M docs:
 
-```json
-{
-  "miss.appendix": 176,
-  "miss.blacklisted": 12124,
-  "miss.blacklisted_fragment": 9,
-  "miss.book_chapter": 46733,
-  "miss.component": 2173,
-  "miss.contrib_intersection_empty": 73592,
-  "miss.dataset_doi": 30806,
-  "miss.num_diff": 1,
-  "miss.release_type": 19767,
-  "miss.short_title": 16737,
-  "miss.subtitle": 11975,
-  "miss.title_filename": 87,
-  "miss.year": 123288,
-  "ok.arxiv_version": 90726,
-  "ok.dummy": 106196,
-  "ok.preprint_published": 10495,
-  "ok.slug_title_author_match": 47285,
-  "ok.title_author_match": 65685,
-  "ok.tokenized_authors": 7592,
-  "skip.container_name_blacklist": 20,
-  "skip.publisher_blacklist": 456,
-  "skip.too_large": 7430,
-  "skip.unique": 8808462,
-  "total": 9481815
-}
-```
+
 
 # A full run
author	Martin Czygan <martin.czygan@gmail.com>	2020-12-23 00:51:22 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2020-12-23 00:51:22 +0100
commit	a64d1a6822838e5e30d9be602847fd76d0fe7c15 (patch)
tree	e929d0c8db64c9569d8509cf2e5f18e35437fae8
parent	85189c2f87e6c198069ec250818df029c58c4a3d (diff)
download	fuzzycat-a64d1a6822838e5e30d9be602847fd76d0fe7c15.tar.gz fuzzycat-a64d1a6822838e5e30d9be602847fd76d0fe7c15.zip