about summary refs log tree commit diff stats
path: root/fuzzycat
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-25 01:22:32 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-25 01:22:32 +0100
commit 6bf0cb8a908122eed9cccd7f9fae35377a692c1d (patch)
tree 587b5c4e9c02fbdceb86001bd3bfd269a372cd1b /fuzzycat
parent 17582f0b1d5e6a33ec353f3ff63f37f0a2764c0c (diff)
download fuzzycat-6bf0cb8a908122eed9cccd7f9fae35377a692c1d.tar.gz
download fuzzycat-6bf0cb8a908122eed9cccd7f9fae35377a692c1d.zip
extend test coverage
Diffstat (limited to 'fuzzycat')
-rw-r--r-- fuzzycat/cluster.py 39
-rw-r--r-- fuzzycat/utils.py 26
-rw-r--r-- fuzzycat/verify.py 5
3 files changed, 39 insertions, 31 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index dfc08b7..c23180f 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -70,12 +70,14 @@ import subprocess
import sys
import tempfile
import unicodedata
-from dataclasses import dataclass, field
+from dataclasses import dataclass
from typing import IO, Any, Callable, Dict, Generator, List, Optional, Tuple
import fuzzy
import regex
+from fuzzycat.utils import cut, slugify_string
+
__all__ = [
"release_key_title",
"release_key_title_normalized",
@@ -97,15 +99,6 @@ class KeyDoc:
get_ident_title = operator.itemgetter("ident", "title")
ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
-printable_no_punct = string.digits + string.ascii_letters + string.whitespace
-
-
-def slugify_string(s: str) -> str:
- """
- Keeps ascii chars and single whitespace only.
- """
- return ''.join((c for c in s.lower() if c in printable_no_punct))
-
# Notes: untie from release_entity, as we are only using a few fields. Maybe
# it's a JSON blob, with a pydantic spec and schema.
@@ -303,25 +296,9 @@ def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
return (ident, key)
-def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
- """
- Return a callable, that extracts a given column from a file with a specific
- separator. TODO: move this into more generic place.
- """
- def func(value):
- parts = value.strip().split(sep)
- if f >= len(parts):
- if ignore_missing_column:
- return ""
- raise ValueError('cannot split value {} into {} parts'.format(value, f))
- return parts[f]
-
- return func
-
-
class Cluster:
"""
- Setup and run clustering over a potentially large number of records.
+ Setup and run clustering over a potentially large (100m) number of records.
"""
def __init__(self,
iterable: collections.abc.Iterable,
@@ -331,7 +308,8 @@ class Cluster:
prefix: str = "fuzzycat-",
tmpdir: str = tempfile.gettempdir(),
strict: bool = False,
- max_cluster_size: int = 100):
+ max_cluster_size: int = 100,
+ verbose=True):
self.iterable: collections.abc.Iterable = iterable
self.key: Callable[[Any], Tuple[str, str]] = key
self.output: IO[str] = output
@@ -340,6 +318,7 @@ class Cluster:
self.strict = strict
self.key_denylist = key_denylist
self.max_cluster_size = max_cluster_size
+ self.verbose = verbose
self.counter: Dict[str, int] = collections.Counter({
"key_fail": 0,
"key_ok": 0,
@@ -355,13 +334,13 @@ class Cluster:
"""
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
for i, line in enumerate(self.iterable):
- if i % 100000 == 0:
+ if i % 100000 == 0 and self.verbose:
print("@{}".format(i), file=sys.stderr)
try:
doc = json.loads(line)
id, key = self.key(doc)
except (KeyError, ValueError):
- if strict:
+ if self.strict:
raise
self.counter["key_fail"] += 1
continue
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
new file mode 100644
index 0000000..f269b11
--- /dev/null
+++ b/fuzzycat/utils.py
@@ -0,0 +1,26 @@
+import io
+import string
+
+printable_no_punct = string.digits + string.ascii_letters + string.whitespace
+
+
+def slugify_string(s: str) -> str:
+ """
+ Keeps ascii chars and single whitespace only.
+ """
+ return ''.join((c for c in s.lower() if c in printable_no_punct))
+
+
+def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
+ """
+ Return a callable that extracts a given column from a line.
+ """
+ def func(value):
+ parts = value.strip().split(sep)
+ if f >= len(parts):
+ if ignore_missing_column:
+ return ""
+ raise ValueError('cannot split value {} into {} parts'.format(value, f))
+ return parts[f]
+
+ return func
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 26e7b2a..7a7f01f 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -132,7 +132,10 @@ class GroupVerifier:
We would need to compare each possible pair and decide whether they are the
same.
"""
- def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10, verbose=True):
+ def __init__(self,
+ iterable: collections.abc.Iterable,
+ max_cluster_size: int = 10,
+ verbose=True):
self.iterable: collections.abc.Iterable = iterable
self.max_cluster_size: int = 10
self.counter = collections.Counter()