From 5962f27aced29bc40efe82113fe057e302b68fa4 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 26 Nov 2020 21:13:08 +0100 Subject: add --min-cluster-size flag to cluster subcommand --- fuzzycat/__main__.py | 4 ++++ fuzzycat/cluster.py | 4 ++++ tests/test_cluster.py | 1 + 3 files changed, 9 insertions(+) diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py index 3a3b3ba..70ed1ab 100644 --- a/fuzzycat/__main__.py +++ b/fuzzycat/__main__.py @@ -77,6 +77,10 @@ if __name__ == '__main__': sub_cluster.set_defaults(func=run_cluster) sub_cluster.add_argument('-f', '--files', default="-", help='input files') sub_cluster.add_argument('--key-denylist', help='file path to key denylist') + sub_cluster.add_argument('--min-cluster-size', + default=2, + type=int, + help='ignore smaller clusters') sub_cluster.add_argument('-t', '--type', default='title', diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index 7843577..10bb431 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -311,6 +311,7 @@ class Cluster: prefix: str = "fuzzycat-", tmpdir: str = tempfile.gettempdir(), strict: bool = False, + min_cluster_size: int = 2, max_cluster_size: int = 100, verbose=True): self.iterable: collections.abc.Iterable = iterable @@ -320,6 +321,7 @@ class Cluster: self.tmpdir: str = tmpdir self.strict = strict self.key_denylist = key_denylist + self.min_cluster_size = min_cluster_size self.max_cluster_size = max_cluster_size self.verbose = verbose self.counter: Dict[str, int] = collections.Counter({ @@ -361,6 +363,8 @@ class Cluster: sf = self.sort(tf.name, opts='-k 2') with open(sf) as f: for doc in self.group_by(f, key=cut(f=1)): + if len(doc["v"]) < self.min_cluster_size: + continue self.counter["num_clusters"] += 1 json.dump(doc, self.output) self.output.write("\n") diff --git a/tests/test_cluster.py b/tests/test_cluster.py index ac1edac..3ad32a7 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -160,6 +160,7 @@ def test_cluster(): ] ], release_key_title_normalized, + min_cluster_size=1, output=sio) stats = cluster.run() assert stats == { -- cgit v1.2.3