update docs

author: Martin Czygan <martin.czygan@gmail.com> 2020-10-22 00:40:30 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2020-10-22 00:40:30 +0200
commit: 8d8ded6d303b653caaa9718cbfb3aa8ef637823e (patch)
tree: 4a7936b897ce98e5eb5f928eb9ea80df03472c1e
parent: 7bd81f80291e7b4458f585b4171a2ae5c660f7e6 (diff)
download: fuzzycat-8d8ded6d303b653caaa9718cbfb3aa8ef637823e.tar.gz
fuzzycat-8d8ded6d303b653caaa9718cbfb3aa8ef637823e.zip
1 files changed, 16 insertions, 4 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index c6d3829..26615df 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -1,13 +1,25 @@
 """
 Clustering part of matching.
 
-We want to have generic and fast way to derive various clusters. Input is a
-json lines of release entities, e.g. from a database dump.
+We want to have a generic and fast way to derive various clusters. Input is json
+lines of release entities, e.g. from a database dump.
 
 Map and reduce.
 
 * input (json) blob -> (ident, value) -> group by value -> emit idents per group
 
+Example output:
+
+    {
+      "v": [
+        "7uvh4z6zsjcptia5ig6swu4fre",
+        "chlthrumyfg23aqw4r477j3vge",
+        "yuo4smv4bzefdjsudbbzka3qv4"
+      ],
+      "k": "124-5_0137.dcm",
+      "c": "t"
+    }
+
 """
 
 import argparse
@@ -107,7 +119,7 @@ def cluster_by_title_normalized(args):
             print("%s\t%s" % (id, title), file=tf)
 
     sbc = sort_by_column(tf.name, opts="-k 2")
-    for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="t"):
+    for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="tn"):
         print(json.dumps(doc).decode("utf-8"))
 
     os.remove(sbc)
@@ -133,7 +145,7 @@ def cluster_by_title_nysiis(args):
             print("%s\t%s" % (id, title), file=tf)
 
     sbc = sort_by_column(tf.name, opts="-k 2")
-    for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="t"):
+    for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="nysiis"):
         print(json.dumps(doc).decode("utf-8"))
 
     os.remove(sbc)
author	Martin Czygan <martin.czygan@gmail.com>	2020-10-22 00:40:30 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2020-10-22 00:40:30 +0200
commit	8d8ded6d303b653caaa9718cbfb3aa8ef637823e (patch)
tree	4a7936b897ce98e5eb5f928eb9ea80df03472c1e
parent	7bd81f80291e7b4458f585b4171a2ae5c660f7e6 (diff)
download	fuzzycat-8d8ded6d303b653caaa9718cbfb3aa8ef637823e.tar.gz fuzzycat-8d8ded6d303b653caaa9718cbfb3aa8ef637823e.zip