From 71a1d68f2f512c7ad4cd0cdb0bdcc65fca7794bf Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Fri, 13 Nov 2020 23:05:57 +0100
Subject: fix tests

---
 Makefile              |  2 +-
 README.md             |  4 ++--
 fuzzycat/verify.py    |  1 -
 tests/test_cluster.py | 52 +--------------------------------------------------
 4 files changed, 4 insertions(+), 55 deletions(-)

diff --git a/Makefile b/Makefile
index 25efac0..ff7cb30 100644
--- a/Makefile
+++ b/Makefile
@@ -31,7 +31,7 @@ cov: ## Run coverage report
 
 .PHONY: test
 test: ## Run coverage report
-	pytest -v fuzzycat/*.py tests/
+	pytest -v fuzzycat/*.py tests/*.py
 
 .PHONY: lint
 lint: $(PY_FILES)
diff --git a/README.md b/README.md
index 7e27f63..33984b1 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ a cache before ops)
 Release clusters start with release entities json lines.
 
 ```shell
-$ cat data/sample.json | python -m fuzzycat.main cluster -t title > out.json
+$ cat data/sample.json | python -m fuzzycat cluster -t title > out.json
 ```
 
 Clustering 1M records (single core) takes about 64s (15K docs/s).
@@ -54,7 +54,7 @@ $ cat data/sample.json | parallel -j 8 --pipe --roundrobin python -m fuzzycat.ma
 ```
 
 Interestingly, the parallel variants detects fewer clusters (because data is
-split and clusters are searched within each batch).
+split and clusters are searched within each batch). TODO(miku): sort out sharding bug.
 
 
 ## Cluster
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 55b8ef6..1a0fb95 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -149,7 +149,6 @@ class GroupVerifier:
 
         print(json.dumps(dict(self.counter)))
 
-
     def compare(self, a, b):
         """
         We compare two release entities here.
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index a010c04..f673206 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -1,7 +1,7 @@
 import pytest
 import collections
 import tempfile
-from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis, sort_by_column, group_by
+from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis
 import os
 
 Case = collections.namedtuple("Case", 'input output')
@@ -100,53 +100,3 @@ def test_release_key_title_nysiis():
     for case in cases:
         assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(
             case.input)
-
-
-def test_release_key_title_authors_ngram():
-    pass
-
-
-def test_sort_by_column():
-    with tempfile.NamedTemporaryFile(delete=False, mode="w") as tf:
-        tf.write("2 b\n")
-        tf.write("2 a\n")
-        tf.write("9 d\n")
-        tf.write("1 c\n")
-
-    fn = sort_by_column(tf.name, opts='-k 2')
-    with open(fn) as f:
-        lines = [v.strip() for v in f]
-        assert lines == ['2 a', '2 b', '1 c', '9 d']
-
-    fn = sort_by_column(tf.name, opts='-k 1')
-    with open(fn) as f:
-        lines = [v.strip() for v in f]
-        assert lines == ['1 c', '2 a', '2 b', '9 d']
-
-    fn = sort_by_column(tf.name, opts='-k 3')
-    with open(fn) as f:
-        lines = [v.strip() for v in f]
-        assert lines == ['1 c', '2 a', '2 b', '9 d']
-
-
-def test_group_by():
-    Case = collections.namedtuple("Case", "seq keyfunc valuefunc result")
-    cases = (
-        Case(["0", "1"], lambda v: v, lambda v: v, [{
-            'k': '0',
-            'v': ['0']
-        }, {
-            'k': '1',
-            'v': ['1']
-        }]),
-        Case(["a 1", "a 2", "b 3"], lambda v: v.split()[0], lambda v: v.split()[1], [{
-            'k': 'a',
-            'v': ['1', '2']
-        }, {
-            'k': 'b',
-            'v': ['3']
-        }]),
-    )
-
-    for case in cases:
-        assert case.result == list(group_by(case.seq, case.keyfunc, case.valuefunc))
-- 
cgit v1.2.3