diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-10-22 00:40:30 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-10-22 00:40:30 +0200 |
commit | 8d8ded6d303b653caaa9718cbfb3aa8ef637823e (patch) | |
tree | 4a7936b897ce98e5eb5f928eb9ea80df03472c1e | |
parent | 7bd81f80291e7b4458f585b4171a2ae5c660f7e6 (diff) | |
download | fuzzycat-8d8ded6d303b653caaa9718cbfb3aa8ef637823e.tar.gz fuzzycat-8d8ded6d303b653caaa9718cbfb3aa8ef637823e.zip |
update docs
-rw-r--r-- | fuzzycat/cluster.py | 20 |
1 file changed, 16 insertions, 4 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index c6d3829..26615df 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -1,13 +1,25 @@ """ Clustering part of matching. -We want to have generic and fast way to derive various clusters. Input is a -json lines of release entities, e.g. from a database dump. +We want to have generic and fast way to derive various clusters. Input is json +lines of release entities, e.g. from a database dump. Map and reduce. * input (json) blob -> (ident, value) -> group by value -> emit idents per group +Example output: + + { + "v": [ + "7uvh4z6zsjcptia5ig6swu4fre", + "chlthrumyfg23aqw4r477j3vge", + "yuo4smv4bzefdjsudbbzka3qv4" + ], + "k": "124-5_0137.dcm", + "c": "t" + } + """ import argparse @@ -107,7 +119,7 @@ def cluster_by_title_normalized(args): print("%s\t%s" % (id, title), file=tf) sbc = sort_by_column(tf.name, opts="-k 2") - for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="t"): + for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="tn"): print(json.dumps(doc).decode("utf-8")) os.remove(sbc) @@ -133,7 +145,7 @@ def cluster_by_title_nysiis(args): print("%s\t%s" % (id, title), file=tf) sbc = sort_by_column(tf.name, opts="-k 2") - for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="t"): + for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="nysiis"): print(json.dumps(doc).decode("utf-8")) os.remove(sbc) |