about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-26 21:13:08 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-26 21:13:08 +0100
commit 5962f27aced29bc40efe82113fe057e302b68fa4 (patch)
tree e0fc4b1cb3cfcdf5fcf4e6b532e1101527742a80
parent acdbdd74783f892843170c34cdd39ede7f0bae4b (diff)
download fuzzycat-5962f27aced29bc40efe82113fe057e302b68fa4.tar.gz
fuzzycat-5962f27aced29bc40efe82113fe057e302b68fa4.zip
add --min-cluster-size flag to cluster subcommand
-rw-r--r-- fuzzycat/__main__.py 4
-rw-r--r-- fuzzycat/cluster.py 4
-rw-r--r-- tests/test_cluster.py 1
3 files changed, 9 insertions, 0 deletions
diff --git a/fuzzycat/__main__.py b/fuzzycat/__main__.py
index 3a3b3ba..70ed1ab 100644
--- a/fuzzycat/__main__.py
+++ b/fuzzycat/__main__.py
@@ -77,6 +77,10 @@ if __name__ == '__main__':
sub_cluster.set_defaults(func=run_cluster)
sub_cluster.add_argument('-f', '--files', default="-", help='input files')
sub_cluster.add_argument('--key-denylist', help='file path to key denylist')
+ sub_cluster.add_argument('--min-cluster-size',
+ default=2,
+ type=int,
+ help='ignore smaller clusters')
sub_cluster.add_argument('-t',
'--type',
default='title',
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 7843577..10bb431 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -311,6 +311,7 @@ class Cluster:
prefix: str = "fuzzycat-",
tmpdir: str = tempfile.gettempdir(),
strict: bool = False,
+ min_cluster_size: int = 2,
max_cluster_size: int = 100,
verbose=True):
self.iterable: collections.abc.Iterable = iterable
@@ -320,6 +321,7 @@ class Cluster:
self.tmpdir: str = tmpdir
self.strict = strict
self.key_denylist = key_denylist
+ self.min_cluster_size = min_cluster_size
self.max_cluster_size = max_cluster_size
self.verbose = verbose
self.counter: Dict[str, int] = collections.Counter({
@@ -361,6 +363,8 @@ class Cluster:
sf = self.sort(tf.name, opts='-k 2')
with open(sf) as f:
for doc in self.group_by(f, key=cut(f=1)):
+ if len(doc["v"]) < self.min_cluster_size:
+ continue
self.counter["num_clusters"] += 1
json.dump(doc, self.output)
self.output.write("\n")
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index ac1edac..3ad32a7 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -160,6 +160,7 @@ def test_cluster():
]
],
release_key_title_normalized,
+ min_cluster_size=1,
output=sio)
stats = cluster.run()
assert stats == {