 Makefile              |  2 +-
 README.md             |  4 ++--
 fuzzycat/verify.py    |  1 -
 tests/test_cluster.py | 52 +-----------------------------------------------
 4 files changed, 4 insertions(+), 55 deletions(-)
diff --git a/Makefile b/Makefile
index 25efac0..ff7cb30 100644
--- a/Makefile
+++ b/Makefile
@@ -31,7 +31,7 @@ cov: ## Run coverage report
.PHONY: test
test: ## Run tests
- pytest -v fuzzycat/*.py tests/
+ pytest -v fuzzycat/*.py tests/*.py
.PHONY: lint
lint: $(PY_FILES)
diff --git a/README.md b/README.md
index 7e27f63..33984b1 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ a cache before ops)
Release clusters start with release entities json lines.
```shell
-$ cat data/sample.json | python -m fuzzycat.main cluster -t title > out.json
+$ cat data/sample.json | python -m fuzzycat cluster -t title > out.json
```
Clustering 1M records (single core) takes about 64s (15K docs/s).
@@ -54,7 +54,7 @@ $ cat data/sample.json | parallel -j 8 --pipe --roundrobin python -m fuzzycat.ma
```
Interestingly, the parallel variant detects fewer clusters (because data is
-split and clusters are searched within each batch).
+split and clusters are searched within each batch). TODO(miku): sort out sharding bug.
## Cluster
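
A side note on the first hunk: `python -m fuzzycat` resolves to the package's `__main__.py`, which this diff does not show. A minimal sketch of such a shim, assuming (not confirmed here) that `fuzzycat.main` exposes a `main()` callable:

```python
# fuzzycat/__main__.py -- hypothetical shim, not part of this commit.
# Lets `python -m fuzzycat cluster ...` delegate to the CLI in fuzzycat.main.
from fuzzycat.main import main

if __name__ == "__main__":
    main()
```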
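On the second hunk: fewer clusters under `parallel` are expected, since records sharing a key can land in different batches, where they are clustered independently. One illustrative way to close that gap (not what the project does; purely a sketch) is a merge pass over the batch outputs, assuming each output line keeps the `{'k': ..., 'v': [...]}` shape seen in the tests removed below:

```python
import collections
import fileinput
import json

def merge_batches(lines):
    """Re-join cluster docs emitted by independent batches that share a key."""
    merged = collections.defaultdict(list)
    for line in lines:
        doc = json.loads(line)
        merged[doc["k"]].extend(doc["v"])
    for k, v in merged.items():
        yield {"k": k, "v": v}

if __name__ == "__main__":
    # e.g. python merge_batches.py batch-*.json
    for doc in merge_batches(fileinput.input()):
        print(json.dumps(doc))
```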
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 55b8ef6..1a0fb95 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -149,7 +149,6 @@ class GroupVerifier:
print(json.dumps(dict(self.counter)))
-
def compare(self, a, b):
"""
We compare two release entities here.
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index a010c04..f673206 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -1,7 +1,7 @@
import pytest
import collections
import tempfile
-from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis, sort_by_column, group_by
+from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis
import os
Case = collections.namedtuple("Case", 'input output')
@@ -100,53 +100,3 @@ def test_release_key_title_nysiis():
for case in cases:
assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(
case.input)
-
-
-def test_release_key_title_authors_ngram():
- pass
-
-
-def test_sort_by_column():
- with tempfile.NamedTemporaryFile(delete=False, mode="w") as tf:
- tf.write("2 b\n")
- tf.write("2 a\n")
- tf.write("9 d\n")
- tf.write("1 c\n")
-
- fn = sort_by_column(tf.name, opts='-k 2')
- with open(fn) as f:
- lines = [v.strip() for v in f]
- assert lines == ['2 a', '2 b', '1 c', '9 d']
-
- fn = sort_by_column(tf.name, opts='-k 1')
- with open(fn) as f:
- lines = [v.strip() for v in f]
- assert lines == ['1 c', '2 a', '2 b', '9 d']
-
- fn = sort_by_column(tf.name, opts='-k 3')
- with open(fn) as f:
- lines = [v.strip() for v in f]
- assert lines == ['1 c', '2 a', '2 b', '9 d']
-
-
-def test_group_by():
- Case = collections.namedtuple("Case", "seq keyfunc valuefunc result")
- cases = (
- Case(["0", "1"], lambda v: v, lambda v: v, [{
- 'k': '0',
- 'v': ['0']
- }, {
- 'k': '1',
- 'v': ['1']
- }]),
- Case(["a 1", "a 2", "b 3"], lambda v: v.split()[0], lambda v: v.split()[1], [{
- 'k': 'a',
- 'v': ['1', '2']
- }, {
- 'k': 'b',
- 'v': ['3']
- }]),
- )
-
- for case in cases:
- assert case.result == list(group_by(case.seq, case.keyfunc, case.valuefunc))
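
For reference, the deleted tests are the only record here of what `sort_by_column` and `group_by` did; the implementations below are reconstructed from the assertions above and are sketches, not the library's actual code. First, `sort_by_column`, which per the assertions shells out to `sort(1)` and returns the path of a sorted copy:

```python
import subprocess
import tempfile

def sort_by_column(filename, opts="-k 1"):
    """Sort a file via GNU sort and return the path to the sorted copy.

    Sketch only: behavior inferred from the removed test_sort_by_column.
    """
    with tempfile.NamedTemporaryFile(delete=False, mode="wb", prefix="fuzzycat-") as tf:
        subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, check=True)
    return tf.name
```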
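And `group_by`, which per the deleted assertions groups a pre-sorted sequence and yields one `{'k': ..., 'v': [...]}` document per key; a thin wrapper over `itertools.groupby` would satisfy them:

```python
import itertools

def group_by(seq, keyfunc, valuefunc):
    """Group consecutive items of a (key-sorted) sequence.

    Yields {'k': key, 'v': [values]} per distinct key, as the removed
    test_group_by cases expect. Sketch only, reconstructed from the tests.
    """
    for key, group in itertools.groupby(seq, key=keyfunc):
        yield {"k": key, "v": [valuefunc(v) for v in group]}
```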