5 files changed, 147 insertions, 32 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index dfc08b7..c23180f 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -70,12 +70,14 @@ import subprocess
 import sys
 import tempfile
 import unicodedata
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import IO, Any, Callable, Dict, Generator, List, Optional, Tuple
 
 import fuzzy
 import regex
 
+from fuzzycat.utils import cut, slugify_string
+
 __all__ = [
     "release_key_title",
     "release_key_title_normalized",
@@ -97,15 +99,6 @@ class KeyDoc:
 get_ident_title = operator.itemgetter("ident", "title")
 ws_replacer = str.maketrans({"\t": " ", "\n": " "})
 non_word_re = re.compile(r'[\W_]+', re.UNICODE)
-printable_no_punct = string.digits + string.ascii_letters + string.whitespace
-
-
-def slugify_string(s: str) -> str:
-    """
-    Keeps ascii chars and single whitespace only.
-    """
-    return ''.join((c for c in s.lower() if c in printable_no_punct))
-
 
 # Notes: untie from release_entity, as we are only using a few fields. Maybe
 # it's a jsob blob, with a pydantic spec and schema.
@@ -303,25 +296,9 @@ def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
     return (ident, key)
 
 
-def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
-    """
-    Return a callable, that extracts a given column from a file with a specific
-    separator. TODO: move this into more generic place.
-    """
-    def func(value):
-        parts = value.strip().split(sep)
-        if f >= len(parts):
-            if ignore_missing_column:
-                return ""
-            raise ValueError('cannot split value {} into {} parts'.format(value, f))
-        return parts[f]
-
-    return func
-
-
 class Cluster:
     """
-    Setup and run clustering over a potentially large number of records.
+    Setup and run clustering over a potentially large (100m) number of records.
     """
     def __init__(self,
                  iterable: collections.abc.Iterable,
@@ -331,7 +308,8 @@ class Cluster:
                  prefix: str = "fuzzycat-",
                  tmpdir: str = tempfile.gettempdir(),
                  strict: bool = False,
-                 max_cluster_size: int = 100):
+                 max_cluster_size: int = 100,
+                 verbose=True):
         self.iterable: collections.abc.Iterable = iterable
         self.key: Callable[[Any], Tuple[str, str]] = key
         self.output: IO[str] = output
@@ -340,6 +318,7 @@ class Cluster:
         self.strict = strict
         self.key_denylist = key_denylist
         self.max_cluster_size = max_cluster_size
+        self.verbose = verbose
         self.counter: Dict[str, int] = collections.Counter({
             "key_fail": 0,
             "key_ok": 0,
@@ -355,13 +334,13 @@ class Cluster:
         """
         with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
             for i, line in enumerate(self.iterable):
-                if i % 100000 == 0:
+                if i % 100000 == 0 and self.verbose:
                     print("@{}".format(i), file=sys.stderr)
                 try:
                     doc = json.loads(line)
                     id, key = self.key(doc)
                 except (KeyError, ValueError):
-                    if strict:
+                    if self.strict:
                         raise
                     self.counter["key_fail"] += 1
                     continue
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
new file mode 100644
index 0000000..f269b11
--- /dev/null
+++ b/fuzzycat/utils.py
@@ -0,0 +1,26 @@
+import io
+import string
+
+printable_no_punct = string.digits + string.ascii_letters + string.whitespace
+
+
+def slugify_string(s: str) -> str:
+    """
+    Lowercase s, keeping only ASCII letters, digits and whitespace (runs are not collapsed).
+    """
+    return ''.join(c for c in s.lower() if c in printable_no_punct)
+
+
+def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
+    """
+    Return a callable that strips a line, splits it on sep and returns column f (zero-based).
+    """
+    def func(value: str) -> str:
+        parts = value.strip().split(sep)  # NOTE: also drops empty edge columns if sep is whitespace
+        if f < len(parts):
+            return parts[f]
+        if ignore_missing_column:  # lenient mode: a missing column reads as empty
+            return ""
+        raise ValueError('cannot extract column {} from value {!r}'.format(f, value))
+
+    return func
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 26e7b2a..7a7f01f 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -132,7 +132,10 @@ class GroupVerifier:
     We would need to compare each possible pair and decide whether they are the
     same.
     """
-    def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10, verbose=True):
+    def __init__(self,
+                 iterable: collections.abc.Iterable,
+                 max_cluster_size: int = 10,
+                 verbose=True):
         self.iterable: collections.abc.Iterable = iterable
         self.max_cluster_size: int = 10
         self.counter = collections.Counter()
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index f2ae4a4..e5944af 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -1,3 +1,5 @@
+import json
+import io
 import collections
 import os
 import tempfile
@@ -5,7 +7,7 @@ import tempfile
 import pytest
 
 from fuzzycat.cluster import (release_key_title, release_key_title_normalized,
-                              release_key_title_nysiis)
+                              release_key_title_nysiis, Cluster)
 
 Case = collections.namedtuple("Case", 'input output')
 
@@ -103,3 +105,85 @@ def test_release_key_title_nysiis():
     for case in cases:
         assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(
             case.input)
+
+
+def test_cluster():
+    sio = io.StringIO()  # in-memory sink handed to Cluster as its output
+    cluster = Cluster([  # case 1: both titles are expected to share the key "helloworld"
+        json.dumps(line) for line in [
+            {
+                "title": "hello world",
+                "ident": 1
+            },
+            {
+                "title": "hello world!",
+                "ident": 2
+            },
+        ]
+    ],
+                      release_key_title_normalized,
+                      output=sio)
+    stats = cluster.run()  # the value returned by run() is compared with the counters below
+    assert stats == {
+        "key_fail": 0,
+        "key_ok": 2,
+        "key_empty": 0,
+        "key_denylist": 0,
+        "num_clusters": 1
+    }
+    assert json.loads(sio.getvalue()) == {  # expects the whole output to be one JSON document
+        "k": "helloworld",
+        "v": [{
+            "title": "hello world!",
+            "ident": 2
+        }, {
+            "title": "hello world",
+            "ident": 1
+        }]
+    }
+
+    sio = io.StringIO()  # fresh sink for the second case
+    cluster = Cluster([  # case 2: adds a third title that is expected to get its own key "other"
+        json.dumps(line) for line in [
+            {
+                "title": "hello world",
+                "ident": 1
+            },
+            {
+                "title": "hello world!",
+                "ident": 2
+            },
+            {
+                "title": "other",
+                "ident": 3
+            },
+        ]
+    ],
+                      release_key_title_normalized,
+                      output=sio)
+    stats = cluster.run()
+    assert stats == {
+        "key_fail": 0,
+        "key_ok": 3,
+        "key_empty": 0,
+        "key_denylist": 0,
+        "num_clusters": 2
+    }
+    assert [json.loads(line) for line in sio.getvalue().split("\n") if line] == [{
+        "k":
+        "helloworld",
+        "v": [{
+            "title": "hello world!",
+            "ident": 2
+        }, {
+            "title": "hello world",
+            "ident": 1
+        }]
+    }, {
+        'k':
+        'other',
+        'v': [{
+            'ident': 3,
+            'title': 'other'
+        }]
+    }]
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..d0e5d48
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,23 @@
+import pytest
+from fuzzycat.utils import slugify_string, cut
+
+
+def test_slugify_string():
+    # (input, expected) pairs: lowercased, punctuation dropped, whitespace runs kept.
+    cases = (
+        ("", ""), ("X", "x"), ("Xx", "xx"), ("Xx x", "xx x"),
+        ("Xx x  x", "xx x  x"), ("Xx?x  x", "xxx  x"),
+        ("Xx? ?x  x", "xx x  x"), ("Xx?_?x--x", "xxxx"),
+        ("=?++*", ""),
+    )
+    for given, expected in cases:
+        assert slugify_string(given) == expected
+
+
+def test_cut():
+    assert cut()("a\tb") == "a"  # default: column 0, tab-separated
+    assert cut(1)("a\tb") == "b"
+    assert cut(2, sep=',')("a,b,c") == "c"
+    assert cut(3, sep=',')("a,b,c") == ""  # missing column is ignored by default
+    with pytest.raises(ValueError):  # strict mode: a missing column is an error
+        cut(3, sep=',', ignore_missing_column=False)("a,b,c")