about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-10-22 00:40:30 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2020-10-22 00:40:30 +0200
commit 8d8ded6d303b653caaa9718cbfb3aa8ef637823e (patch)
tree 4a7936b897ce98e5eb5f928eb9ea80df03472c1e
parent 7bd81f80291e7b4458f585b4171a2ae5c660f7e6 (diff)
download fuzzycat-8d8ded6d303b653caaa9718cbfb3aa8ef637823e.tar.gz
fuzzycat-8d8ded6d303b653caaa9718cbfb3aa8ef637823e.zip
update docs
-rw-r--r-- fuzzycat/cluster.py 20
1 files changed, 16 insertions, 4 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index c6d3829..26615df 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -1,13 +1,25 @@
"""
Clustering part of matching.
-We want to have generic and fast way to derive various clusters. Input is a
-json lines of release entities, e.g. from a database dump.
+We want to have a generic and fast way to derive various clusters. Input is json
+lines of release entities, e.g. from a database dump.
Map and reduce.
* input (json) blob -> (ident, value) -> group by value -> emit idents per group
+Example output:
+
+ {
+ "v": [
+ "7uvh4z6zsjcptia5ig6swu4fre",
+ "chlthrumyfg23aqw4r477j3vge",
+ "yuo4smv4bzefdjsudbbzka3qv4"
+ ],
+ "k": "124-5_0137.dcm",
+ "c": "t"
+ }
+
"""
import argparse
@@ -107,7 +119,7 @@ def cluster_by_title_normalized(args):
print("%s\t%s" % (id, title), file=tf)
sbc = sort_by_column(tf.name, opts="-k 2")
- for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="t"):
+ for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="tn"):
print(json.dumps(doc).decode("utf-8"))
os.remove(sbc)
@@ -133,7 +145,7 @@ def cluster_by_title_nysiis(args):
print("%s\t%s" % (id, title), file=tf)
sbc = sort_by_column(tf.name, opts="-k 2")
- for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="t"):
+ for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="nysiis"):
print(json.dumps(doc).decode("utf-8"))
os.remove(sbc)