about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- fuzzycat/cluster.py 20
1 files changed, 16 insertions, 4 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index c6d3829..26615df 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -1,13 +1,25 @@
"""
Clustering part of matching.
-We want to have generic and fast way to derive various clusters. Input is a
-json lines of release entities, e.g. from a database dump.
+We want to have a generic and fast way to derive various clusters. Input is
+json lines of release entities, e.g. from a database dump.
Map and reduce.
* input (json) blob -> (ident, value) -> group by value -> emit idents per group
+Example output:
+
+ {
+ "v": [
+ "7uvh4z6zsjcptia5ig6swu4fre",
+ "chlthrumyfg23aqw4r477j3vge",
+ "yuo4smv4bzefdjsudbbzka3qv4"
+ ],
+ "k": "124-5_0137.dcm",
+ "c": "t"
+ }
+
"""
import argparse
@@ -107,7 +119,7 @@ def cluster_by_title_normalized(args):
print("%s\t%s" % (id, title), file=tf)
sbc = sort_by_column(tf.name, opts="-k 2")
- for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="t"):
+ for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="tn"):
print(json.dumps(doc).decode("utf-8"))
os.remove(sbc)
@@ -133,7 +145,7 @@ def cluster_by_title_nysiis(args):
print("%s\t%s" % (id, title), file=tf)
sbc = sort_by_column(tf.name, opts="-k 2")
- for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="t"):
+ for doc in group_by_column(sbc, key=cut(f=1), value=cut(f=0), comment="nysiis"):
print(json.dumps(doc).decode("utf-8"))
os.remove(sbc)