author     Martin Czygan <martin.czygan@gmail.com>    2021-09-24 13:58:51 +0200
committer  Martin Czygan <martin.czygan@gmail.com>    2021-09-24 13:58:51 +0200
commit     478d7d06ad9e56145cb94f3461c355b1ba9eb491 (patch)
tree       fa467290e8c8df41a1e97a6de751d0f7e790c9de /tests
parent     86cc3191ce03042ef4a0c6c8a44f4094a140b802 (diff)
download   fuzzycat-478d7d06ad9e56145cb94f3461c355b1ba9eb491.tar.gz
           fuzzycat-478d7d06ad9e56145cb94f3461c355b1ba9eb491.zip
start larger refactoring: remove cluster
background: verifying hundreds of millions of documents turned out to be
slow; anecdata: running clustering and verification over 1.8B inputs took
over 50h, whereas the Go port (skate) required about 2-4h for the same
operations. Also, with Go we do not need the extra GNU parallel wrapping.
In any case, we aim for the fuzzycat refactoring to provide:
* better, more configurable verification and small-scale matching
* removal of the batch clustering code (and improved refcat docs)
* a place for more generic, similarity-based utilities (sketched below)
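For a sense of what such similarity-based utilities look like, here is a minimal sketch in the spirit of the helpers exercised in tests/test_utils.py (names like jaccard and tokenize_string appear there); this is an illustration, not fuzzycat's exact implementation:

def tokenize_string(s):
    # Simplified tokenizer: lowercase and split on whitespace.
    return s.lower().split()


def jaccard(a, b):
    # Jaccard similarity of two token sets: size of the intersection
    # divided by size of the union.
    a, b = set(a), set(b)
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)


# Two title variants that share all tokens score 1.0.
print(jaccard(tokenize_string("Hello World"), tokenize_string("hello world")))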
The most important piece in fuzzycat is a CSV file containing hand-picked
test examples for verification, plus the code that is able to satisfy that
test suite. We want to make this part more robust.
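As an illustration of how a test driven by that CSV file could look, here is a hedged sketch; the file path (tests/data/verify.csv), the column layout (two inputs plus an expected status), and the fuzzycat.verify.verify import and its (status, reason) return shape are assumptions, not a description of the actual code:

import csv

import pytest

from fuzzycat.verify import verify  # assumed import path and signature


def load_cases(path="tests/data/verify.csv"):
    # Assumed layout: each row holds two release blobs/idents and the expected status.
    with open(path) as handle:
        return [tuple(row) for row in csv.reader(handle)]


@pytest.mark.parametrize("a, b, expected", load_cases())
def test_verify_examples(a, b, expected):
    # The hand-picked examples serve as a regression suite for the verifier.
    status, _reason = verify(a, b)  # assumed (status, reason) return value
    assert status == expected, "verification mismatch for ({}, {})".format(a, b)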
Diffstat (limited to 'tests')
-rw-r--r--  tests/test_cluster.py  | 189
-rw-r--r--  tests/test_matching.py |   6
-rw-r--r--  tests/test_utils.py    |  16
3 files changed, 12 insertions, 199 deletions
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
deleted file mode 100644
index 55b349a..0000000
--- a/tests/test_cluster.py
+++ /dev/null
@@ -1,189 +0,0 @@
-import collections
-import io
-import json
-import os
-import tempfile
-
-import pytest
-
-from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
-                              release_key_title_nysiis)
-
-Case = collections.namedtuple("Case", 'input output')
-
-
-def test_release_key_title():
-    with pytest.raises(KeyError):
-        release_key_title({})
-    with pytest.raises(KeyError, match='title'):
-        release_key_title({'ident': '123'})
-    with pytest.raises(KeyError, match='ident'):
-        release_key_title({'title': 'deep learning backdoor'})
-    with pytest.raises(ValueError, match='title.*missing'):
-        release_key_title({'ident': '', 'title': ''})
-    cases = (
-        Case(input={
-            'ident': '',
-            'title': 'simhash'
-        }, output=('', 'simhash')),
-        Case(input={
-            'ident': '',
-            'title': 'Simhash'
-        }, output=('', 'Simhash')),
-        Case(input={
-            'ident': '',
-            'title': 'Sim hash'
-        }, output=('', 'Sim hash')),
-    )
-    for case in cases:
-        assert case.output == release_key_title(case.input)
-
-
-def test_release_key_title_normalized():
-    cases = (
-        Case(input={
-            'ident': '',
-            'title': 'simhash'
-        }, output=('', 'simhash')),
-        Case(input={
-            'ident': '',
-            'title': 'Simhash'
-        }, output=('', 'simhash')),
-        Case(input={
-            'ident': '',
-            'title': 'Sim hash'
-        }, output=('', 'simhash')),
-        Case(input={
-            'ident': '',
-            'title': 'THE year 1929'
-        }, output=('', 'theyear1929')),
-        Case(input={
-            'ident': '',
-            'title': '2019?'
-        }, output=('', '2019')),
-        Case(input={
-            'ident': '123',
-            'title': 'H~~2019?'
-        }, output=('123', 'h2019')),
-    )
-    for case in cases:
-        assert case.output == release_key_title_normalized(case.input), 'failed case {}'.format(
-            case.input)
-
-
-def test_release_key_title_nysiis():
-    cases = (
-        Case(input={
-            'ident': '',
-            'title': 'simhash'
-        }, output=('', 'SANM')),
-        Case(input={
-            'ident': '',
-            'title': 'Simhash'
-        }, output=('', 'SANM')),
-        Case(input={
-            'ident': '',
-            'title': 'Sim hash'
-        }, output=('', 'SAN')),
-        Case(input={
-            'ident': '',
-            'title': 'THE year 1929'
-        }, output=('', 'T')),
-        Case(input={
-            'ident': '',
-            'title': '2019?'
-        }, output=('', '2019?')),
-        Case(input={
-            'ident': '123',
-            'title': 'H~~2019?'
-        }, output=('123', 'H~2019?')),
-        Case(input={
-            'ident': '123',
-            'title': '世界'
-        }, output=('123', '世界')),
-    )
-    for case in cases:
-        assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(
-            case.input)
-
-
-def test_cluster():
-    sio = io.StringIO()
-    lines = [
-        json.dumps(doc) for doc in [
-            {
-                "title": "hello world",
-                "ident": 1,
-            },
-            {
-                "title": "hello world!",
-                "ident": 2,
-            },
-        ]
-    ]
-    cluster = Cluster(lines, release_key_title_normalized, output=sio)
-    stats = cluster.run()
-    assert stats == {
-        "key_fail": 0,
-        "key_ok": 2,
-        "key_empty": 0,
-        "key_denylist": 0,
-        "num_clusters": 1
-    }
-    assert json.loads(sio.getvalue()) == {
-        "k": "helloworld",
-        "v": [{
-            "title": "hello world!",
-            "ident": 2
-        }, {
-            "title": "hello world",
-            "ident": 1
-        }]
-    }
-
-    sio = io.StringIO()
-    cluster = Cluster([
-        json.dumps(line) for line in [
-            {
-                "title": "hello world",
-                "ident": 1
-            },
-            {
-                "title": "hello world!",
-                "ident": 2
-            },
-            {
-                "title": "other",
-                "ident": 3
-            },
-        ]
-    ],
-                      release_key_title_normalized,
-                      min_cluster_size=1,
-                      output=sio)
-    stats = cluster.run()
-    assert stats == {
-        "key_fail": 0,
-        "key_ok": 3,
-        "key_empty": 0,
-        "key_denylist": 0,
-        "num_clusters": 2
-    }
-    assert [json.loads(line) for line in sio.getvalue().split("\n") if line] == [{
-        "k":
-        "helloworld",
-        "v": [{
-            "title": "hello world!",
-            "ident": 2
-        }, {
-            "title": "hello world",
-            "ident": 1
-        }]
-    }, {
-        'k':
-        'other',
-        'v': [{
-            'ident': 3,
-            'title': 'other'
-        }]
-    }]
diff --git a/tests/test_matching.py b/tests/test_matching.py
index 90d1fee..ad971a5 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -81,7 +81,9 @@ def test_match_release_fuzzy(es_client, caplog):
         }, 2),
         ({
             "title": "",
-            "contribs": [{"raw_name": "Aristoteles"}],
+            "contribs": [{
+                "raw_name": "Aristoteles"
+            }],
             "ext_ids": {}
         }, 5),
         # ({
@@ -102,5 +104,5 @@ def test_match_release_fuzzy(es_client, caplog):
         result = match_release_fuzzy(entity, es=es_client)
         with caplog.at_level(logging.INFO):
             logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result),
-                                                                      [v.title for v in result]))
+                                                                        [v.title for v in result]))
         assert len(result) == count, doc
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 957203f..b2242b8 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,7 +2,7 @@ import pytest
 import os
 
 from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string,
-                            token_n_grams, tokenize_string, parse_page_string, dict_key_exists,
+                            token_n_grams, tokenize_string, parse_page_string, dict_has_key,
                             zstdlines, es_compat_hits_total, clean_doi)
 
 
@@ -67,13 +67,13 @@ def test_nwise():
     assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]
 
 
-def test_dict_key_exists():
-    assert dict_key_exists({}, "") is False
-    assert dict_key_exists({"a": "a"}, "a") == True
-    assert dict_key_exists({"a": "a"}, "b") == False
-    assert dict_key_exists({"a": {"b": "c"}}, "a.b") == True
-    assert dict_key_exists({"a": {"b": None}}, "a.b") == True
-    assert dict_key_exists({"a": {"b": "c"}}, "a.b.c") == False
+def test_dict_has_key():
+    assert dict_has_key({}, "") is False
+    assert dict_has_key({"a": "a"}, "a") == True
+    assert dict_has_key({"a": "a"}, "b") == False
+    assert dict_has_key({"a": {"b": "c"}}, "a.b") == True
+    assert dict_has_key({"a": {"b": None}}, "a.b") == True
+    assert dict_has_key({"a": {"b": "c"}}, "a.b.c") == False
 
 
 def test_page_page_string():
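The dict_key_exists to dict_has_key rename above is specified entirely by the assertions in test_dict_has_key: the helper checks whether a dotted path exists in a nested dictionary, even when the final value is None. One possible implementation consistent with those assertions (a sketch for illustration, not necessarily the code in fuzzycat.utils) is:

def dict_has_key(doc, path):
    # Return True if the nested dictionary contains the dotted path,
    # e.g. dict_has_key({"a": {"b": None}}, "a.b") is True, while an
    # empty path or a missing key yields False.
    keys = path.split(".") if path else []
    if not keys:
        return False
    current = doc
    for key in keys:
        if not isinstance(current, dict) or key not in current:
            return False
        current = current[key]
    return True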