aboutsummaryrefslogtreecommitdiffstats
path: root/tests/test_cluster.py
diff options
context:
space:
mode:
Diffstat (limited to 'tests/test_cluster.py')
-rw-r--r--tests/test_cluster.py189
1 files changed, 0 insertions, 189 deletions
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
deleted file mode 100644
index 55b349a..0000000
--- a/tests/test_cluster.py
+++ /dev/null
@@ -1,189 +0,0 @@
-import collections
-import io
-import json
-import os
-import tempfile
-
-import pytest
-
-from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
- release_key_title_nysiis)
-
-Case = collections.namedtuple("Case", 'input output')
-
-
-def test_release_key_title():
- with pytest.raises(KeyError):
- release_key_title({})
- with pytest.raises(KeyError, match='title'):
- release_key_title({'ident': '123'})
- with pytest.raises(KeyError, match='ident'):
- release_key_title({'title': 'deep learning backdoor'})
- with pytest.raises(ValueError, match='title.*missing'):
- release_key_title({'ident': '', 'title': ''})
- cases = (
- Case(input={
- 'ident': '',
- 'title': 'simhash'
- }, output=('', 'simhash')),
- Case(input={
- 'ident': '',
- 'title': 'Simhash'
- }, output=('', 'Simhash')),
- Case(input={
- 'ident': '',
- 'title': 'Sim hash'
- }, output=('', 'Sim hash')),
- )
- for case in cases:
- assert case.output == release_key_title(case.input)
-
-
-def test_release_key_title_normalized():
- cases = (
- Case(input={
- 'ident': '',
- 'title': 'simhash'
- }, output=('', 'simhash')),
- Case(input={
- 'ident': '',
- 'title': 'Simhash'
- }, output=('', 'simhash')),
- Case(input={
- 'ident': '',
- 'title': 'Sim hash'
- }, output=('', 'simhash')),
- Case(input={
- 'ident': '',
- 'title': 'THE year 1929'
- }, output=('', 'theyear1929')),
- Case(input={
- 'ident': '',
- 'title': '2019?'
- }, output=('', '2019')),
- Case(input={
- 'ident': '123',
- 'title': 'H~~2019?'
- }, output=('123', 'h2019')),
- )
- for case in cases:
- assert case.output == release_key_title_normalized(case.input), 'failed case {}'.format(
- case.input)
-
-
-def test_release_key_title_nysiis():
- cases = (
- Case(input={
- 'ident': '',
- 'title': 'simhash'
- }, output=('', 'SANM')),
- Case(input={
- 'ident': '',
- 'title': 'Simhash'
- }, output=('', 'SANM')),
- Case(input={
- 'ident': '',
- 'title': 'Sim hash'
- }, output=('', 'SAN')),
- Case(input={
- 'ident': '',
- 'title': 'THE year 1929'
- }, output=('', 'T')),
- Case(input={
- 'ident': '',
- 'title': '2019?'
- }, output=('', '2019?')),
- Case(input={
- 'ident': '123',
- 'title': 'H~~2019?'
- }, output=('123', 'H~2019?')),
- Case(input={
- 'ident': '123',
- 'title': '世界'
- }, output=('123', '世界')),
- )
- for case in cases:
- assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(
- case.input)
-
-
-def test_cluster():
- sio = io.StringIO()
- lines = [
- json.dumps(doc) for doc in [
- {
- "title": "hello world",
- "ident": 1,
- },
- {
- "title": "hello world!",
- "ident": 2,
- },
- ]
- ]
- cluster = Cluster(lines, release_key_title_normalized, output=sio)
- stats = cluster.run()
- assert stats == {
- "key_fail": 0,
- "key_ok": 2,
- "key_empty": 0,
- "key_denylist": 0,
- "num_clusters": 1
- }
- assert json.loads(sio.getvalue()) == {
- "k": "helloworld",
- "v": [{
- "title": "hello world!",
- "ident": 2
- }, {
- "title": "hello world",
- "ident": 1
- }]
- }
-
- sio = io.StringIO()
- cluster = Cluster([
- json.dumps(line) for line in [
- {
- "title": "hello world",
- "ident": 1
- },
- {
- "title": "hello world!",
- "ident": 2
- },
- {
- "title": "other",
- "ident": 3
- },
- ]
- ],
- release_key_title_normalized,
- min_cluster_size=1,
- output=sio)
- stats = cluster.run()
- assert stats == {
- "key_fail": 0,
- "key_ok": 3,
- "key_empty": 0,
- "key_denylist": 0,
- "num_clusters": 2
- }
- assert [json.loads(line) for line in sio.getvalue().split("\n") if line] == [{
- "k":
- "helloworld",
- "v": [{
- "title": "hello world!",
- "ident": 2
- }, {
- "title": "hello world",
- "ident": 1
- }]
- }, {
- 'k':
- 'other',
- 'v': [{
- 'ident': 3,
- 'title': 'other'
- }]
- }]