diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-25 01:22:32 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-25 01:22:32 +0100 |
commit | 6bf0cb8a908122eed9cccd7f9fae35377a692c1d (patch) | |
tree | 587b5c4e9c02fbdceb86001bd3bfd269a372cd1b /fuzzycat | |
parent | 17582f0b1d5e6a33ec353f3ff63f37f0a2764c0c (diff) | |
download | fuzzycat-6bf0cb8a908122eed9cccd7f9fae35377a692c1d.tar.gz fuzzycat-6bf0cb8a908122eed9cccd7f9fae35377a692c1d.zip |
extend test coverage
Diffstat (limited to 'fuzzycat')
-rw-r--r-- | fuzzycat/cluster.py | 39 | ||||
-rw-r--r-- | fuzzycat/utils.py | 26 | ||||
-rw-r--r-- | fuzzycat/verify.py | 5 |
3 files changed, 39 insertions, 31 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index dfc08b7..c23180f 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -70,12 +70,14 @@ import subprocess import sys import tempfile import unicodedata -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import IO, Any, Callable, Dict, Generator, List, Optional, Tuple import fuzzy import regex +from fuzzycat.utils import cut, slugify_string + __all__ = [ "release_key_title", "release_key_title_normalized", @@ -97,15 +99,6 @@ class KeyDoc: get_ident_title = operator.itemgetter("ident", "title") ws_replacer = str.maketrans({"\t": " ", "\n": " "}) non_word_re = re.compile(r'[\W_]+', re.UNICODE) -printable_no_punct = string.digits + string.ascii_letters + string.whitespace - - -def slugify_string(s: str) -> str: - """ - Keeps ascii chars and single whitespace only. - """ - return ''.join((c for c in s.lower() if c in printable_no_punct)) - # Notes: untie from release_entity, as we are only using a few fields. Maybe # it's a jsob blob, with a pydantic spec and schema. @@ -303,25 +296,9 @@ def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]: return (ident, key) -def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True): - """ - Return a callable, that extracts a given column from a file with a specific - separator. TODO: move this into more generic place. - """ - def func(value): - parts = value.strip().split(sep) - if f >= len(parts): - if ignore_missing_column: - return "" - raise ValueError('cannot split value {} into {} parts'.format(value, f)) - return parts[f] - - return func - - class Cluster: """ - Setup and run clustering over a potentially large number of records. + Setup and run clustering over a potentially large (100m) number of records. """ def __init__(self, iterable: collections.abc.Iterable, @@ -331,7 +308,8 @@ class Cluster: prefix: str = "fuzzycat-", tmpdir: str = tempfile.gettempdir(), strict: bool = False, - max_cluster_size: int = 100): + max_cluster_size: int = 100, + verbose=True): self.iterable: collections.abc.Iterable = iterable self.key: Callable[[Any], Tuple[str, str]] = key self.output: IO[str] = output @@ -340,6 +318,7 @@ class Cluster: self.strict = strict self.key_denylist = key_denylist self.max_cluster_size = max_cluster_size + self.verbose = verbose self.counter: Dict[str, int] = collections.Counter({ "key_fail": 0, "key_ok": 0, @@ -355,13 +334,13 @@ class Cluster: """ with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf: for i, line in enumerate(self.iterable): - if i % 100000 == 0: + if i % 100000 == 0 and self.verbose: print("@{}".format(i), file=sys.stderr) try: doc = json.loads(line) id, key = self.key(doc) except (KeyError, ValueError): - if strict: + if self.strict: raise self.counter["key_fail"] += 1 continue diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py new file mode 100644 index 0000000..f269b11 --- /dev/null +++ b/fuzzycat/utils.py @@ -0,0 +1,26 @@ +import io +import string + +printable_no_punct = string.digits + string.ascii_letters + string.whitespace + + +def slugify_string(s: str) -> str: + """ + Keeps ascii chars and single whitespace only. + """ + return ''.join((c for c in s.lower() if c in printable_no_punct)) + + +def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True): + """ + Return a callable that extracts a given column from a line. + """ + def func(value): + parts = value.strip().split(sep) + if f >= len(parts): + if ignore_missing_column: + return "" + raise ValueError('cannot split value {} into {} parts'.format(value, f)) + return parts[f] + + return func diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 26e7b2a..7a7f01f 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -132,7 +132,10 @@ class GroupVerifier: We would need to compare each possible pair and decide whether they are the same. """ - def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10, verbose=True): + def __init__(self, + iterable: collections.abc.Iterable, + max_cluster_size: int = 10, + verbose=True): self.iterable: collections.abc.Iterable = iterable self.max_cluster_size: int = 10 self.counter = collections.Counter() |