diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2020-11-13 23:05:57 +0100 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2020-11-13 23:05:57 +0100 | 
| commit | 71a1d68f2f512c7ad4cd0cdb0bdcc65fca7794bf (patch) | |
| tree | d876fcf32ee276acf62fbc512f8f973f54515b2f | |
| parent | 1f91606501754bf8d3fa8b3075a05c147470c7bb (diff) | |
| download | fuzzycat-71a1d68f2f512c7ad4cd0cdb0bdcc65fca7794bf.tar.gz fuzzycat-71a1d68f2f512c7ad4cd0cdb0bdcc65fca7794bf.zip | |
fix tests
| -rw-r--r-- | Makefile | 2 | ||||
| -rw-r--r-- | README.md | 4 | ||||
| -rw-r--r-- | fuzzycat/verify.py | 1 | ||||
| -rw-r--r-- | tests/test_cluster.py | 52 | 
4 files changed, 4 insertions, 55 deletions
| @@ -31,7 +31,7 @@ cov: ## Run coverage report  .PHONY: test  test: ## Run coverage report -	pytest -v fuzzycat/*.py tests/ +	pytest -v fuzzycat/*.py tests/*.py  .PHONY: lint  lint: $(PY_FILES) @@ -21,7 +21,7 @@ a cache before ops)  Release clusters start with release entities json lines.  ```shell -$ cat data/sample.json | python -m fuzzycat.main cluster -t title > out.json +$ cat data/sample.json | python -m fuzzycat cluster -t title > out.json  ```  Clustering 1M records (single core) takes about 64s (15K docs/s). @@ -54,7 +54,7 @@ $ cat data/sample.json | parallel -j 8 --pipe --roundrobin python -m fuzzycat.ma  ```  Interestingly, the parallel variants detects fewer clusters (because data is -split and clusters are searched within each batch). +split and clusters are searched within each batch). TODO(miku): sort out sharding bug.  ## Cluster diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 55b8ef6..1a0fb95 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -149,7 +149,6 @@ class GroupVerifier:          print(json.dumps(dict(self.counter))) -      def compare(self, a, b):          """          We compare two release entities here. diff --git a/tests/test_cluster.py b/tests/test_cluster.py index a010c04..f673206 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -1,7 +1,7 @@  import pytest  import collections  import tempfile -from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis, sort_by_column, group_by +from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis  import os  Case = collections.namedtuple("Case", 'input output') @@ -100,53 +100,3 @@ def test_release_key_title_nysiis():      for case in cases:          assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(              case.input) - - -def test_release_key_title_authors_ngram(): -    pass - - -def test_sort_by_column(): -    with tempfile.NamedTemporaryFile(delete=False, mode="w") as tf: -        tf.write("2 b\n") -        tf.write("2 a\n") -        tf.write("9 d\n") -        tf.write("1 c\n") - -    fn = sort_by_column(tf.name, opts='-k 2') -    with open(fn) as f: -        lines = [v.strip() for v in f] -        assert lines == ['2 a', '2 b', '1 c', '9 d'] - -    fn = sort_by_column(tf.name, opts='-k 1') -    with open(fn) as f: -        lines = [v.strip() for v in f] -        assert lines == ['1 c', '2 a', '2 b', '9 d'] - -    fn = sort_by_column(tf.name, opts='-k 3') -    with open(fn) as f: -        lines = [v.strip() for v in f] -        assert lines == ['1 c', '2 a', '2 b', '9 d'] - - -def test_group_by(): -    Case = collections.namedtuple("Case", "seq keyfunc valuefunc result") -    cases = ( -        Case(["0", "1"], lambda v: v, lambda v: v, [{ -            'k': '0', -            'v': ['0'] -        }, { -            'k': '1', -            'v': ['1'] -        }]), -        Case(["a 1", "a 2", "b 3"], lambda v: v.split()[0], lambda v: v.split()[1], [{ -            'k': 'a', -            'v': ['1', '2'] -        }, { -            'k': 'b', -            'v': ['3'] -        }]), -    ) - -    for case in cases: -        assert case.result == list(group_by(case.seq, case.keyfunc, case.valuefunc)) | 
