From 6bf0cb8a908122eed9cccd7f9fae35377a692c1d Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 25 Nov 2020 01:22:32 +0100 Subject: extend test coverage --- fuzzycat/cluster.py | 39 ++++++----------------- fuzzycat/utils.py | 26 ++++++++++++++++ fuzzycat/verify.py | 5 ++- tests/test_cluster.py | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++- tests/test_utils.py | 23 ++++++++++++++ 5 files changed, 147 insertions(+), 32 deletions(-) create mode 100644 fuzzycat/utils.py create mode 100644 tests/test_utils.py diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index dfc08b7..c23180f 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -70,12 +70,14 @@ import subprocess import sys import tempfile import unicodedata -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import IO, Any, Callable, Dict, Generator, List, Optional, Tuple import fuzzy import regex +from fuzzycat.utils import cut, slugify_string + __all__ = [ "release_key_title", "release_key_title_normalized", @@ -97,15 +99,6 @@ class KeyDoc: get_ident_title = operator.itemgetter("ident", "title") ws_replacer = str.maketrans({"\t": " ", "\n": " "}) non_word_re = re.compile(r'[\W_]+', re.UNICODE) -printable_no_punct = string.digits + string.ascii_letters + string.whitespace - - -def slugify_string(s: str) -> str: - """ - Keeps ascii chars and single whitespace only. - """ - return ''.join((c for c in s.lower() if c in printable_no_punct)) - # Notes: untie from release_entity, as we are only using a few fields. Maybe # it's a jsob blob, with a pydantic spec and schema. @@ -303,25 +296,9 @@ def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]: return (ident, key) -def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True): - """ - Return a callable, that extracts a given column from a file with a specific - separator. TODO: move this into more generic place. - """ - def func(value): - parts = value.strip().split(sep) - if f >= len(parts): - if ignore_missing_column: - return "" - raise ValueError('cannot split value {} into {} parts'.format(value, f)) - return parts[f] - - return func - - class Cluster: """ - Setup and run clustering over a potentially large number of records. + Setup and run clustering over a potentially large (100m) number of records. """ def __init__(self, iterable: collections.abc.Iterable, @@ -331,7 +308,8 @@ class Cluster: prefix: str = "fuzzycat-", tmpdir: str = tempfile.gettempdir(), strict: bool = False, - max_cluster_size: int = 100): + max_cluster_size: int = 100, + verbose=True): self.iterable: collections.abc.Iterable = iterable self.key: Callable[[Any], Tuple[str, str]] = key self.output: IO[str] = output @@ -340,6 +318,7 @@ class Cluster: self.strict = strict self.key_denylist = key_denylist self.max_cluster_size = max_cluster_size + self.verbose = verbose self.counter: Dict[str, int] = collections.Counter({ "key_fail": 0, "key_ok": 0, @@ -355,13 +334,13 @@ class Cluster: """ with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf: for i, line in enumerate(self.iterable): - if i % 100000 == 0: + if i % 100000 == 0 and self.verbose: print("@{}".format(i), file=sys.stderr) try: doc = json.loads(line) id, key = self.key(doc) except (KeyError, ValueError): - if strict: + if self.strict: raise self.counter["key_fail"] += 1 continue diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py new file mode 100644 index 0000000..f269b11 --- /dev/null +++ b/fuzzycat/utils.py @@ -0,0 +1,26 @@ +import io +import string + +printable_no_punct = string.digits + string.ascii_letters + string.whitespace + + +def slugify_string(s: str) -> str: + """ + Keeps ascii chars and single whitespace only. + """ + return ''.join((c for c in s.lower() if c in printable_no_punct)) + + +def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True): + """ + Return a callable that extracts a given column from a line. + """ + def func(value): + parts = value.strip().split(sep) + if f >= len(parts): + if ignore_missing_column: + return "" + raise ValueError('cannot split value {} into {} parts'.format(value, f)) + return parts[f] + + return func diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 26e7b2a..7a7f01f 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -132,7 +132,10 @@ class GroupVerifier: We would need to compare each possible pair and decide whether they are the same. """ - def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10, verbose=True): + def __init__(self, + iterable: collections.abc.Iterable, + max_cluster_size: int = 10, + verbose=True): self.iterable: collections.abc.Iterable = iterable self.max_cluster_size: int = 10 self.counter = collections.Counter() diff --git a/tests/test_cluster.py b/tests/test_cluster.py index f2ae4a4..e5944af 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -1,3 +1,5 @@ +import json +import io import collections import os import tempfile @@ -5,7 +7,7 @@ import tempfile import pytest from fuzzycat.cluster import (release_key_title, release_key_title_normalized, - release_key_title_nysiis) + release_key_title_nysiis, Cluster) Case = collections.namedtuple("Case", 'input output') @@ -103,3 +105,85 @@ def test_release_key_title_nysiis(): for case in cases: assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format( case.input) + + +def test_cluster(): + sio = io.StringIO() + cluster = Cluster([ + json.dumps(line) for line in [ + { + "title": "hello world", + "ident": 1 + }, + { + "title": "hello world!", + "ident": 2 + }, + ] + ], + release_key_title_normalized, + output=sio) + stats = cluster.run() + assert stats == { + "key_fail": 0, + "key_ok": 2, + "key_empty": 0, + "key_denylist": 0, + "num_clusters": 1 + } + assert json.loads(sio.getvalue()) == { + "k": "helloworld", + "v": [{ + "title": "hello world!", + "ident": 2 + }, { + "title": "hello world", + "ident": 1 + }] + } + + sio = io.StringIO() + cluster = Cluster([ + json.dumps(line) for line in [ + { + "title": "hello world", + "ident": 1 + }, + { + "title": "hello world!", + "ident": 2 + }, + { + "title": "other", + "ident": 3 + }, + ] + ], + release_key_title_normalized, + output=sio) + stats = cluster.run() + assert stats == { + "key_fail": 0, + "key_ok": 3, + "key_empty": 0, + "key_denylist": 0, + "num_clusters": 2 + } + assert [json.loads(line) for line in sio.getvalue().split("\n") if line] == [{ + "k": + "helloworld", + "v": [{ + "title": "hello world!", + "ident": 2 + }, { + "title": "hello world", + "ident": 1 + }] + }, { + 'k': + 'other', + 'v': [{ + 'ident': 3, + 'title': 'other' + }] + }] diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..d0e5d48 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,23 @@ +import pytest +from fuzzycat.utils import slugify_string, cut + + +def test_slugify_string(): + assert slugify_string("") == "" + assert slugify_string("X") == "x" + assert slugify_string("Xx") == "xx" + assert slugify_string("Xx x") == "xx x" + assert slugify_string("Xx x x") == "xx x x" + assert slugify_string("Xx?x x") == "xxx x" + assert slugify_string("Xx? ?x x") == "xx x x" + assert slugify_string("Xx?_?x--x") == "xxxx" + assert slugify_string("=?++*") == "" + + +def test_cut(): + assert cut()("a b") == "a" + assert cut(1)("a b") == "b" + assert cut(2, sep=',')("a,b,c") == "c" + assert cut(3, sep=',')("a,b,c") == "" + with pytest.raises(ValueError): + cut(3, sep=',', ignore_missing_column=False)("a,b,c") == "" -- cgit v1.2.3