From 71a1d68f2f512c7ad4cd0cdb0bdcc65fca7794bf Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 13 Nov 2020 23:05:57 +0100 Subject: fix tests --- Makefile | 2 +- README.md | 4 ++-- fuzzycat/verify.py | 1 - tests/test_cluster.py | 52 +-------------------------------------------------- 4 files changed, 4 insertions(+), 55 deletions(-) diff --git a/Makefile b/Makefile index 25efac0..ff7cb30 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ cov: ## Run coverage report .PHONY: test test: ## Run coverage report - pytest -v fuzzycat/*.py tests/ + pytest -v fuzzycat/*.py tests/*.py .PHONY: lint lint: $(PY_FILES) diff --git a/README.md b/README.md index 7e27f63..33984b1 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ a cache before ops) Release clusters start with release entities json lines. ```shell -$ cat data/sample.json | python -m fuzzycat.main cluster -t title > out.json +$ cat data/sample.json | python -m fuzzycat cluster -t title > out.json ``` Clustering 1M records (single core) takes about 64s (15K docs/s). @@ -54,7 +54,7 @@ $ cat data/sample.json | parallel -j 8 --pipe --roundrobin python -m fuzzycat.ma ``` Interestingly, the parallel variants detects fewer clusters (because data is -split and clusters are searched within each batch). +split and clusters are searched within each batch). TODO(miku): sort out sharding bug. ## Cluster diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 55b8ef6..1a0fb95 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -149,7 +149,6 @@ class GroupVerifier: print(json.dumps(dict(self.counter))) - def compare(self, a, b): """ We compare two release entities here. diff --git a/tests/test_cluster.py b/tests/test_cluster.py index a010c04..f673206 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -1,7 +1,7 @@ import pytest import collections import tempfile -from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis, sort_by_column, group_by +from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis import os Case = collections.namedtuple("Case", 'input output') @@ -100,53 +100,3 @@ def test_release_key_title_nysiis(): for case in cases: assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format( case.input) - - -def test_release_key_title_authors_ngram(): - pass - - -def test_sort_by_column(): - with tempfile.NamedTemporaryFile(delete=False, mode="w") as tf: - tf.write("2 b\n") - tf.write("2 a\n") - tf.write("9 d\n") - tf.write("1 c\n") - - fn = sort_by_column(tf.name, opts='-k 2') - with open(fn) as f: - lines = [v.strip() for v in f] - assert lines == ['2 a', '2 b', '1 c', '9 d'] - - fn = sort_by_column(tf.name, opts='-k 1') - with open(fn) as f: - lines = [v.strip() for v in f] - assert lines == ['1 c', '2 a', '2 b', '9 d'] - - fn = sort_by_column(tf.name, opts='-k 3') - with open(fn) as f: - lines = [v.strip() for v in f] - assert lines == ['1 c', '2 a', '2 b', '9 d'] - - -def test_group_by(): - Case = collections.namedtuple("Case", "seq keyfunc valuefunc result") - cases = ( - Case(["0", "1"], lambda v: v, lambda v: v, [{ - 'k': '0', - 'v': ['0'] - }, { - 'k': '1', - 'v': ['1'] - }]), - Case(["a 1", "a 2", "b 3"], lambda v: v.split()[0], lambda v: v.split()[1], [{ - 'k': 'a', - 'v': ['1', '2'] - }, { - 'k': 'b', - 'v': ['3'] - }]), - ) - - for case in cases: - assert case.result == list(group_by(case.seq, case.keyfunc, case.valuefunc)) -- cgit v1.2.3