From 478d7d06ad9e56145cb94f3461c355b1ba9eb491 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 24 Sep 2021 13:58:51 +0200 Subject: start larger refactoring: remove cluster background: verifying hundreds of millions of documents turned out to be a bit slow; anecdata: running clustering and verification over 1.8B inputs took over 50h; cf. the Go port (skate) required about 2-4h for those operations. Also: with Go we do not need the extra GNU parallel wrapping. In any case, we aim for fuzzycat refactoring to provide: * better, more configurable verification and small scale matching * removal of batch clustering code (and improve refcat docs) * a place for a bit more generic, similarity based utils The most important piece in fuzzycat is a CSV file containing hand picked test examples for verification - and the code that is able to fulfill that test suite. We want to make this part more robust. --- tests/test_cluster.py | 189 -------------------------------------------------- 1 file changed, 189 deletions(-) delete mode 100644 tests/test_cluster.py (limited to 'tests/test_cluster.py') diff --git a/tests/test_cluster.py b/tests/test_cluster.py deleted file mode 100644 index 55b349a..0000000 --- a/tests/test_cluster.py +++ /dev/null @@ -1,189 +0,0 @@ -import collections -import io -import json -import os -import tempfile - -import pytest - -from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized, - release_key_title_nysiis) - -Case = collections.namedtuple("Case", 'input output') - - -def test_release_key_title(): - with pytest.raises(KeyError): - release_key_title({}) - with pytest.raises(KeyError, match='title'): - release_key_title({'ident': '123'}) - with pytest.raises(KeyError, match='ident'): - release_key_title({'title': 'deep learning backdoor'}) - with pytest.raises(ValueError, match='title.*missing'): - release_key_title({'ident': '', 'title': ''}) - cases = ( - Case(input={ - 'ident': '', - 'title': 'simhash' - }, 
output=('', 'simhash')), - Case(input={ - 'ident': '', - 'title': 'Simhash' - }, output=('', 'Simhash')), - Case(input={ - 'ident': '', - 'title': 'Sim hash' - }, output=('', 'Sim hash')), - ) - for case in cases: - assert case.output == release_key_title(case.input) - - -def test_release_key_title_normalized(): - cases = ( - Case(input={ - 'ident': '', - 'title': 'simhash' - }, output=('', 'simhash')), - Case(input={ - 'ident': '', - 'title': 'Simhash' - }, output=('', 'simhash')), - Case(input={ - 'ident': '', - 'title': 'Sim hash' - }, output=('', 'simhash')), - Case(input={ - 'ident': '', - 'title': 'THE year 1929' - }, output=('', 'theyear1929')), - Case(input={ - 'ident': '', - 'title': '2019?' - }, output=('', '2019')), - Case(input={ - 'ident': '123', - 'title': 'H~~2019?' - }, output=('123', 'h2019')), - ) - for case in cases: - assert case.output == release_key_title_normalized(case.input), 'failed case {}'.format( - case.input) - - -def test_release_key_title_nysiis(): - cases = ( - Case(input={ - 'ident': '', - 'title': 'simhash' - }, output=('', 'SANM')), - Case(input={ - 'ident': '', - 'title': 'Simhash' - }, output=('', 'SANM')), - Case(input={ - 'ident': '', - 'title': 'Sim hash' - }, output=('', 'SAN')), - Case(input={ - 'ident': '', - 'title': 'THE year 1929' - }, output=('', 'T')), - Case(input={ - 'ident': '', - 'title': '2019?' - }, output=('', '2019?')), - Case(input={ - 'ident': '123', - 'title': 'H~~2019?' 
- }, output=('123', 'H~2019?')), - Case(input={ - 'ident': '123', - 'title': '世界' - }, output=('123', '世界')), - ) - for case in cases: - assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format( - case.input) - - -def test_cluster(): - sio = io.StringIO() - lines = [ - json.dumps(doc) for doc in [ - { - "title": "hello world", - "ident": 1, - }, - { - "title": "hello world!", - "ident": 2, - }, - ] - ] - cluster = Cluster(lines, release_key_title_normalized, output=sio) - stats = cluster.run() - assert stats == { - "key_fail": 0, - "key_ok": 2, - "key_empty": 0, - "key_denylist": 0, - "num_clusters": 1 - } - assert json.loads(sio.getvalue()) == { - "k": "helloworld", - "v": [{ - "title": "hello world!", - "ident": 2 - }, { - "title": "hello world", - "ident": 1 - }] - } - - sio = io.StringIO() - cluster = Cluster([ - json.dumps(line) for line in [ - { - "title": "hello world", - "ident": 1 - }, - { - "title": "hello world!", - "ident": 2 - }, - { - "title": "other", - "ident": 3 - }, - ] - ], - release_key_title_normalized, - min_cluster_size=1, - output=sio) - stats = cluster.run() - assert stats == { - "key_fail": 0, - "key_ok": 3, - "key_empty": 0, - "key_denylist": 0, - "num_clusters": 2 - } - assert [json.loads(line) for line in sio.getvalue().split("\n") if line] == [{ - "k": - "helloworld", - "v": [{ - "title": "hello world!", - "ident": 2 - }, { - "title": "hello world", - "ident": 1 - }] - }, { - 'k': - 'other', - 'v': [{ - 'ident': 3, - 'title': 'other' - }] - }] -- cgit v1.2.3