extend test coverage

author: Martin Czygan <martin.czygan@gmail.com> 2020-11-25 01:22:32 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2020-11-25 01:22:32 +0100
commit: 6bf0cb8a908122eed9cccd7f9fae35377a692c1d (patch)
tree: 587b5c4e9c02fbdceb86001bd3bfd269a372cd1b /fuzzycat
parent: 17582f0b1d5e6a33ec353f3ff63f37f0a2764c0c (diff)
download: fuzzycat-6bf0cb8a908122eed9cccd7f9fae35377a692c1d.tar.gz
fuzzycat-6bf0cb8a908122eed9cccd7f9fae35377a692c1d.zip
3 files changed, 39 insertions, 31 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index dfc08b7..c23180f 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -70,12 +70,14 @@ import subprocess
 import sys
 import tempfile
 import unicodedata
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import IO, Any, Callable, Dict, Generator, List, Optional, Tuple
 
 import fuzzy
 import regex
 
+from fuzzycat.utils import cut, slugify_string
+
 __all__ = [
     "release_key_title",
     "release_key_title_normalized",
@@ -97,15 +99,6 @@ class KeyDoc:
 get_ident_title = operator.itemgetter("ident", "title")
 ws_replacer = str.maketrans({"\t": " ", "\n": " "})
 non_word_re = re.compile(r'[\W_]+', re.UNICODE)
-printable_no_punct = string.digits + string.ascii_letters + string.whitespace
-
-
-def slugify_string(s: str) -> str:
-    """
-    Keeps ascii chars and single whitespace only.
-    """
-    return ''.join((c for c in s.lower() if c in printable_no_punct))
-
 
 # Notes: untie from release_entity, as we are only using a few fields. Maybe
 # it's a jsob blob, with a pydantic spec and schema.
@@ -303,25 +296,9 @@ def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
     return (ident, key)
 
 
-def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
-    """
-    Return a callable, that extracts a given column from a file with a specific
-    separator. TODO: move this into more generic place.
-    """
-    def func(value):
-        parts = value.strip().split(sep)
-        if f >= len(parts):
-            if ignore_missing_column:
-                return ""
-            raise ValueError('cannot split value {} into {} parts'.format(value, f))
-        return parts[f]
-
-    return func
-
-
 class Cluster:
     """
-    Setup and run clustering over a potentially large number of records.
+    Setup and run clustering over a potentially large (100m) number of records.
     """
     def __init__(self,
                  iterable: collections.abc.Iterable,
@@ -331,7 +308,8 @@ class Cluster:
                  prefix: str = "fuzzycat-",
                  tmpdir: str = tempfile.gettempdir(),
                  strict: bool = False,
-                 max_cluster_size: int = 100):
+                 max_cluster_size: int = 100,
+                 verbose=True):
         self.iterable: collections.abc.Iterable = iterable
         self.key: Callable[[Any], Tuple[str, str]] = key
         self.output: IO[str] = output
@@ -340,6 +318,7 @@ class Cluster:
         self.strict = strict
         self.key_denylist = key_denylist
         self.max_cluster_size = max_cluster_size
+        self.verbose = verbose
         self.counter: Dict[str, int] = collections.Counter({
             "key_fail": 0,
             "key_ok": 0,
@@ -355,13 +334,13 @@ class Cluster:
         """
         with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
             for i, line in enumerate(self.iterable):
-                if i % 100000 == 0:
+                if i % 100000 == 0 and self.verbose:
                     print("@{}".format(i), file=sys.stderr)
                 try:
                     doc = json.loads(line)
                     id, key = self.key(doc)
                 except (KeyError, ValueError):
-                    if strict:
+                    if self.strict:
                         raise
                     self.counter["key_fail"] += 1
                     continue
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
new file mode 100644
index 0000000..f269b11
--- /dev/null
+++ b/fuzzycat/utils.py
@@ -0,0 +1,26 @@
+import io
+import string
+
+printable_no_punct = string.digits + string.ascii_letters + string.whitespace
+
+
+def slugify_string(s: str) -> str:
+    """
+    Lowercases the input and keeps only ascii letters, digits and whitespace.
+    """
+    return ''.join((c for c in s.lower() if c in printable_no_punct))
+
+
+def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
+    """
+    Return a callable that extracts a given column from a line.
+    """
+    def func(value):
+        parts = value.strip().split(sep)
+        if f >= len(parts):
+            if ignore_missing_column:
+                return ""
+            raise ValueError('cannot split value {} into {} parts'.format(value, f))
+        return parts[f]
+
+    return func
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 26e7b2a..7a7f01f 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -132,7 +132,10 @@ class GroupVerifier:
     We would need to compare each possible pair and decide whether they are the
     same.
     """
-    def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10, verbose=True):
+    def __init__(self,
+                 iterable: collections.abc.Iterable,
+                 max_cluster_size: int = 10,
+                 verbose=True):
         self.iterable: collections.abc.Iterable = iterable
         self.max_cluster_size: int = 10
         self.counter = collections.Counter()
author	Martin Czygan <martin.czygan@gmail.com>	2020-11-25 01:22:32 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2020-11-25 01:22:32 +0100
commit	6bf0cb8a908122eed9cccd7f9fae35377a692c1d (patch)
tree	587b5c4e9c02fbdceb86001bd3bfd269a372cd1b /fuzzycat
parent	17582f0b1d5e6a33ec353f3ff63f37f0a2764c0c (diff)
download	fuzzycat-6bf0cb8a908122eed9cccd7f9fae35377a692c1d.tar.gz fuzzycat-6bf0cb8a908122eed9cccd7f9fae35377a692c1d.zip