diff options
-rw-r--r-- | Makefile | 2 | ||||
-rw-r--r-- | README.md | 4 | ||||
-rw-r--r-- | fuzzycat/verify.py | 1 | ||||
-rw-r--r-- | tests/test_cluster.py | 52 |
4 files changed, 4 insertions, 55 deletions
@@ -31,7 +31,7 @@ cov: ## Run coverage report .PHONY: test test: ## Run coverage report - pytest -v fuzzycat/*.py tests/ + pytest -v fuzzycat/*.py tests/*.py .PHONY: lint lint: $(PY_FILES) @@ -21,7 +21,7 @@ a cache before ops) Release clusters start with release entities json lines. ```shell -$ cat data/sample.json | python -m fuzzycat.main cluster -t title > out.json +$ cat data/sample.json | python -m fuzzycat cluster -t title > out.json ``` Clustering 1M records (single core) takes about 64s (15K docs/s). @@ -54,7 +54,7 @@ $ cat data/sample.json | parallel -j 8 --pipe --roundrobin python -m fuzzycat.ma ``` Interestingly, the parallel variants detects fewer clusters (because data is -split and clusters are searched within each batch). +split and clusters are searched within each batch). TODO(miku): sort out sharding bug. ## Cluster diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 55b8ef6..1a0fb95 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -149,7 +149,6 @@ class GroupVerifier: print(json.dumps(dict(self.counter))) - def compare(self, a, b): """ We compare two release entities here. diff --git a/tests/test_cluster.py b/tests/test_cluster.py index a010c04..f673206 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -1,7 +1,7 @@ import pytest import collections import tempfile -from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis, sort_by_column, group_by +from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis import os Case = collections.namedtuple("Case", 'input output') @@ -100,53 +100,3 @@ def test_release_key_title_nysiis(): for case in cases: assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format( case.input) - - -def test_release_key_title_authors_ngram(): - pass - - -def test_sort_by_column(): - with tempfile.NamedTemporaryFile(delete=False, mode="w") as tf: - tf.write("2 b\n") - tf.write("2 a\n") - tf.write("9 d\n") - tf.write("1 c\n") - - fn = sort_by_column(tf.name, opts='-k 2') - with open(fn) as f: - lines = [v.strip() for v in f] - assert lines == ['2 a', '2 b', '1 c', '9 d'] - - fn = sort_by_column(tf.name, opts='-k 1') - with open(fn) as f: - lines = [v.strip() for v in f] - assert lines == ['1 c', '2 a', '2 b', '9 d'] - - fn = sort_by_column(tf.name, opts='-k 3') - with open(fn) as f: - lines = [v.strip() for v in f] - assert lines == ['1 c', '2 a', '2 b', '9 d'] - - -def test_group_by(): - Case = collections.namedtuple("Case", "seq keyfunc valuefunc result") - cases = ( - Case(["0", "1"], lambda v: v, lambda v: v, [{ - 'k': '0', - 'v': ['0'] - }, { - 'k': '1', - 'v': ['1'] - }]), - Case(["a 1", "a 2", "b 3"], lambda v: v.split()[0], lambda v: v.split()[1], [{ - 'k': 'a', - 'v': ['1', '2'] - }, { - 'k': 'b', - 'v': ['3'] - }]), - ) - - for case in cases: - assert case.result == list(group_by(case.seq, case.keyfunc, case.valuefunc)) |