diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-23 17:41:04 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-23 17:41:04 +0100 |
commit | a2692ab56ff77cba165d37bb5250046a01338268 (patch) | |
tree | 493b7365bd13a23207283c4d6ea642aa57d5a492 | |
parent | fa272a05f36c6d9e5a0eb628bb9a687deefb3e20 (diff) | |
download | fuzzycat-a2692ab56ff77cba165d37bb5250046a01338268.tar.gz fuzzycat-a2692ab56ff77cba165d37bb5250046a01338268.zip |
add max_cluster_size
-rw-r--r-- | fuzzycat/cluster.py | 12 |
1 files changed, 8 insertions, 4 deletions
```diff
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 59214a1..f3d547a 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -356,7 +356,8 @@ class Cluster:
                  key_denylist: Optional[List[str]] = None,
                  prefix: str = "fuzzycat-",
                  tmpdir: str = tempfile.gettempdir(),
-                 strict: bool = False):
+                 strict: bool = False,
+                 max_cluster_size=100):
         self.iterable: collections.abc.Iterable = iterable
         self.key: Callable[[Any], Tuple[str, str]] = key
         self.output: IO[str] = output
@@ -371,6 +372,7 @@ class Cluster:
         })
         self.strict = strict
         self.key_denylist = key_denylist
+        self.max_cluster_size = max_cluster_size
 
     def run(self):
         """
@@ -437,10 +439,12 @@ class Cluster:
         There might be large clusters, which would currently exceed memory.
         Mitigate by splitting large clusters into parts.
         """
-        for k, g in itertools.groupby(seq, key=key):
-            items = list(g)
+        for k, g in enumerate(itertools.groupby(seq, key=key):
             payload = []
-            for line in items:
+            for i, line in enumerate(g):
+                if i > 0 and i == self.max_cluster_size:
+                    print('max cluster size cut off for: {}'.format(k))
+                    break
                 # XXX: This is a bit too much "serde", get rid of this.
                 fields = line.split("\t")
                 if len(fields) < 3:
```