aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fuzzycat/cluster.py39
-rw-r--r--fuzzycat/utils.py26
-rw-r--r--fuzzycat/verify.py5
-rw-r--r--tests/test_cluster.py86
-rw-r--r--tests/test_utils.py23
5 files changed, 147 insertions, 32 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index dfc08b7..c23180f 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -70,12 +70,14 @@ import subprocess
import sys
import tempfile
import unicodedata
-from dataclasses import dataclass, field
+from dataclasses import dataclass
from typing import IO, Any, Callable, Dict, Generator, List, Optional, Tuple
import fuzzy
import regex
+from fuzzycat.utils import cut, slugify_string
+
__all__ = [
"release_key_title",
"release_key_title_normalized",
@@ -97,15 +99,6 @@ class KeyDoc:
get_ident_title = operator.itemgetter("ident", "title")
ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
-printable_no_punct = string.digits + string.ascii_letters + string.whitespace
-
-
-def slugify_string(s: str) -> str:
- """
- Keeps ascii chars and single whitespace only.
- """
- return ''.join((c for c in s.lower() if c in printable_no_punct))
-
# Notes: untie from release_entity, as we are only using a few fields. Maybe
# it's a JSON blob, with a pydantic spec and schema.
@@ -303,25 +296,9 @@ def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
return (ident, key)
-def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
- """
- Return a callable, that extracts a given column from a file with a specific
- separator. TODO: move this into more generic place.
- """
- def func(value):
- parts = value.strip().split(sep)
- if f >= len(parts):
- if ignore_missing_column:
- return ""
- raise ValueError('cannot split value {} into {} parts'.format(value, f))
- return parts[f]
-
- return func
-
-
class Cluster:
"""
- Setup and run clustering over a potentially large number of records.
+ Setup and run clustering over a potentially large (100m) number of records.
"""
def __init__(self,
iterable: collections.abc.Iterable,
@@ -331,7 +308,8 @@ class Cluster:
prefix: str = "fuzzycat-",
tmpdir: str = tempfile.gettempdir(),
strict: bool = False,
- max_cluster_size: int = 100):
+ max_cluster_size: int = 100,
+ verbose=True):
self.iterable: collections.abc.Iterable = iterable
self.key: Callable[[Any], Tuple[str, str]] = key
self.output: IO[str] = output
@@ -340,6 +318,7 @@ class Cluster:
self.strict = strict
self.key_denylist = key_denylist
self.max_cluster_size = max_cluster_size
+ self.verbose = verbose
self.counter: Dict[str, int] = collections.Counter({
"key_fail": 0,
"key_ok": 0,
@@ -355,13 +334,13 @@ class Cluster:
"""
with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
for i, line in enumerate(self.iterable):
- if i % 100000 == 0:
+ if i % 100000 == 0 and self.verbose:
print("@{}".format(i), file=sys.stderr)
try:
doc = json.loads(line)
id, key = self.key(doc)
except (KeyError, ValueError):
- if strict:
+ if self.strict:
raise
self.counter["key_fail"] += 1
continue
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
new file mode 100644
index 0000000..f269b11
--- /dev/null
+++ b/fuzzycat/utils.py
@@ -0,0 +1,26 @@
+import io
+import string
+
+printable_no_punct = string.digits + string.ascii_letters + string.whitespace
+
+
+def slugify_string(s: str) -> str:
+ """
+ Lowercases the input and keeps only ASCII letters, digits and whitespace.
+ """
+ return ''.join((c for c in s.lower() if c in printable_no_punct))
+
+
+def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
+ """
+ Return a callable that extracts a given column from a line.
+ """
+ def func(value):
+ parts = value.strip().split(sep)
+ if f >= len(parts):
+ if ignore_missing_column:
+ return ""
+ raise ValueError('cannot split value {} into {} parts'.format(value, f))
+ return parts[f]
+
+ return func
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 26e7b2a..7a7f01f 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -132,7 +132,10 @@ class GroupVerifier:
We would need to compare each possible pair and decide whether they are the
same.
"""
- def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10, verbose=True):
+ def __init__(self,
+ iterable: collections.abc.Iterable,
+ max_cluster_size: int = 10,
+ verbose=True):
self.iterable: collections.abc.Iterable = iterable
self.max_cluster_size: int = 10
self.counter = collections.Counter()
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index f2ae4a4..e5944af 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -1,3 +1,5 @@
+import json
+import io
import collections
import os
import tempfile
@@ -5,7 +7,7 @@ import tempfile
import pytest
from fuzzycat.cluster import (release_key_title, release_key_title_normalized,
- release_key_title_nysiis)
+ release_key_title_nysiis, Cluster)
Case = collections.namedtuple("Case", 'input output')
@@ -103,3 +105,85 @@ def test_release_key_title_nysiis():
for case in cases:
assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(
case.input)
+
+
+def test_cluster():
+ sio = io.StringIO()
+ cluster = Cluster([
+ json.dumps(line) for line in [
+ {
+ "title": "hello world",
+ "ident": 1
+ },
+ {
+ "title": "hello world!",
+ "ident": 2
+ },
+ ]
+ ],
+ release_key_title_normalized,
+ output=sio)
+ stats = cluster.run()
+ assert stats == {
+ "key_fail": 0,
+ "key_ok": 2,
+ "key_empty": 0,
+ "key_denylist": 0,
+ "num_clusters": 1
+ }
+ assert json.loads(sio.getvalue()) == {
+ "k": "helloworld",
+ "v": [{
+ "title": "hello world!",
+ "ident": 2
+ }, {
+ "title": "hello world",
+ "ident": 1
+ }]
+ }
+
+ sio = io.StringIO()
+ cluster = Cluster([
+ json.dumps(line) for line in [
+ {
+ "title": "hello world",
+ "ident": 1
+ },
+ {
+ "title": "hello world!",
+ "ident": 2
+ },
+ {
+ "title": "other",
+ "ident": 3
+ },
+ ]
+ ],
+ release_key_title_normalized,
+ output=sio)
+ stats = cluster.run()
+ assert stats == {
+ "key_fail": 0,
+ "key_ok": 3,
+ "key_empty": 0,
+ "key_denylist": 0,
+ "num_clusters": 2
+ }
+ assert [json.loads(line) for line in sio.getvalue().split("\n") if line] == [{
+ "k":
+ "helloworld",
+ "v": [{
+ "title": "hello world!",
+ "ident": 2
+ }, {
+ "title": "hello world",
+ "ident": 1
+ }]
+ }, {
+ 'k':
+ 'other',
+ 'v': [{
+ 'ident': 3,
+ 'title': 'other'
+ }]
+ }]
diff --git a/tests/test_utils.py b/tests/test_utils.py
new file mode 100644
index 0000000..d0e5d48
--- /dev/null
+++ b/tests/test_utils.py
@@ -0,0 +1,23 @@
+import pytest
+from fuzzycat.utils import slugify_string, cut
+
+
+def test_slugify_string():
+ assert slugify_string("") == ""
+ assert slugify_string("X") == "x"
+ assert slugify_string("Xx") == "xx"
+ assert slugify_string("Xx x") == "xx x"
+ assert slugify_string("Xx x x") == "xx x x"
+ assert slugify_string("Xx?x x") == "xxx x"
+ assert slugify_string("Xx? ?x x") == "xx x x"
+ assert slugify_string("Xx?_?x--x") == "xxxx"
+ assert slugify_string("=?++*") == ""
+
+
+def test_cut():
+ assert cut()("a b") == "a"
+ assert cut(1)("a b") == "b"
+ assert cut(2, sep=',')("a,b,c") == "c"
+ assert cut(3, sep=',')("a,b,c") == ""
+ with pytest.raises(ValueError):
+ cut(3, sep=',', ignore_missing_column=False)("a,b,c") == ""