aboutsummaryrefslogtreecommitdiffstats
path: root/tests
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-09-24 13:58:51 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-09-24 13:58:51 +0200
commit478d7d06ad9e56145cb94f3461c355b1ba9eb491 (patch)
treefa467290e8c8df41a1e97a6de751d0f7e790c9de /tests
parent86cc3191ce03042ef4a0c6c8a44f4094a140b802 (diff)
downloadfuzzycat-478d7d06ad9e56145cb94f3461c355b1ba9eb491.tar.gz
fuzzycat-478d7d06ad9e56145cb94f3461c355b1ba9eb491.zip
start larger refactoring: remove cluster
background: verifying hundreds of millions of documents turned out to be a bit slow; anecdata: running clustering and verification over 1.8B inputs took over 50h; cf. the Go port (skate) required about 2-4h for those operations. Also: with Go we do not need the extra GNU parallel wrapping. In any case, we aim for fuzzycat refactoring to provide: * better, more configurable verification and small scale matching * removal of batch clustering code (and improve refcat docs) * a place for a bit more generic, similarity based utils The most important piece in fuzzycat is a CSV file containing hand picked test examples for verification - and the code that is able to fulfill that test suite. We want to make this part more robust.
Diffstat (limited to 'tests')
-rw-r--r--tests/test_cluster.py189
-rw-r--r--tests/test_matching.py6
-rw-r--r--tests/test_utils.py16
3 files changed, 12 insertions, 199 deletions
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
deleted file mode 100644
index 55b349a..0000000
--- a/tests/test_cluster.py
+++ /dev/null
@@ -1,189 +0,0 @@
-import collections
-import io
-import json
-import os
-import tempfile
-
-import pytest
-
-from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
- release_key_title_nysiis)
-
-Case = collections.namedtuple("Case", 'input output')
-
-
-def test_release_key_title():
- with pytest.raises(KeyError):
- release_key_title({})
- with pytest.raises(KeyError, match='title'):
- release_key_title({'ident': '123'})
- with pytest.raises(KeyError, match='ident'):
- release_key_title({'title': 'deep learning backdoor'})
- with pytest.raises(ValueError, match='title.*missing'):
- release_key_title({'ident': '', 'title': ''})
- cases = (
- Case(input={
- 'ident': '',
- 'title': 'simhash'
- }, output=('', 'simhash')),
- Case(input={
- 'ident': '',
- 'title': 'Simhash'
- }, output=('', 'Simhash')),
- Case(input={
- 'ident': '',
- 'title': 'Sim hash'
- }, output=('', 'Sim hash')),
- )
- for case in cases:
- assert case.output == release_key_title(case.input)
-
-
-def test_release_key_title_normalized():
- cases = (
- Case(input={
- 'ident': '',
- 'title': 'simhash'
- }, output=('', 'simhash')),
- Case(input={
- 'ident': '',
- 'title': 'Simhash'
- }, output=('', 'simhash')),
- Case(input={
- 'ident': '',
- 'title': 'Sim hash'
- }, output=('', 'simhash')),
- Case(input={
- 'ident': '',
- 'title': 'THE year 1929'
- }, output=('', 'theyear1929')),
- Case(input={
- 'ident': '',
- 'title': '2019?'
- }, output=('', '2019')),
- Case(input={
- 'ident': '123',
- 'title': 'H~~2019?'
- }, output=('123', 'h2019')),
- )
- for case in cases:
- assert case.output == release_key_title_normalized(case.input), 'failed case {}'.format(
- case.input)
-
-
-def test_release_key_title_nysiis():
- cases = (
- Case(input={
- 'ident': '',
- 'title': 'simhash'
- }, output=('', 'SANM')),
- Case(input={
- 'ident': '',
- 'title': 'Simhash'
- }, output=('', 'SANM')),
- Case(input={
- 'ident': '',
- 'title': 'Sim hash'
- }, output=('', 'SAN')),
- Case(input={
- 'ident': '',
- 'title': 'THE year 1929'
- }, output=('', 'T')),
- Case(input={
- 'ident': '',
- 'title': '2019?'
- }, output=('', '2019?')),
- Case(input={
- 'ident': '123',
- 'title': 'H~~2019?'
- }, output=('123', 'H~2019?')),
- Case(input={
- 'ident': '123',
- 'title': '世界'
- }, output=('123', '世界')),
- )
- for case in cases:
- assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(
- case.input)
-
-
-def test_cluster():
- sio = io.StringIO()
- lines = [
- json.dumps(doc) for doc in [
- {
- "title": "hello world",
- "ident": 1,
- },
- {
- "title": "hello world!",
- "ident": 2,
- },
- ]
- ]
- cluster = Cluster(lines, release_key_title_normalized, output=sio)
- stats = cluster.run()
- assert stats == {
- "key_fail": 0,
- "key_ok": 2,
- "key_empty": 0,
- "key_denylist": 0,
- "num_clusters": 1
- }
- assert json.loads(sio.getvalue()) == {
- "k": "helloworld",
- "v": [{
- "title": "hello world!",
- "ident": 2
- }, {
- "title": "hello world",
- "ident": 1
- }]
- }
-
- sio = io.StringIO()
- cluster = Cluster([
- json.dumps(line) for line in [
- {
- "title": "hello world",
- "ident": 1
- },
- {
- "title": "hello world!",
- "ident": 2
- },
- {
- "title": "other",
- "ident": 3
- },
- ]
- ],
- release_key_title_normalized,
- min_cluster_size=1,
- output=sio)
- stats = cluster.run()
- assert stats == {
- "key_fail": 0,
- "key_ok": 3,
- "key_empty": 0,
- "key_denylist": 0,
- "num_clusters": 2
- }
- assert [json.loads(line) for line in sio.getvalue().split("\n") if line] == [{
- "k":
- "helloworld",
- "v": [{
- "title": "hello world!",
- "ident": 2
- }, {
- "title": "hello world",
- "ident": 1
- }]
- }, {
- 'k':
- 'other',
- 'v': [{
- 'ident': 3,
- 'title': 'other'
- }]
- }]
diff --git a/tests/test_matching.py b/tests/test_matching.py
index 90d1fee..ad971a5 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -81,7 +81,9 @@ def test_match_release_fuzzy(es_client, caplog):
}, 2),
({
"title": "",
- "contribs": [{"raw_name": "Aristoteles"}],
+ "contribs": [{
+ "raw_name": "Aristoteles"
+ }],
"ext_ids": {}
}, 5),
# ({
@@ -102,5 +104,5 @@ def test_match_release_fuzzy(es_client, caplog):
result = match_release_fuzzy(entity, es=es_client)
with caplog.at_level(logging.INFO):
logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result),
- [v.title for v in result]))
+ [v.title for v in result]))
assert len(result) == count, doc
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 957203f..b2242b8 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,7 +2,7 @@ import pytest
import os
from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string,
- token_n_grams, tokenize_string, parse_page_string, dict_key_exists,
+ token_n_grams, tokenize_string, parse_page_string, dict_has_key,
zstdlines, es_compat_hits_total, clean_doi)
@@ -67,13 +67,13 @@ def test_nwise():
assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]
-def test_dict_key_exists():
- assert dict_key_exists({}, "") is False
- assert dict_key_exists({"a": "a"}, "a") == True
- assert dict_key_exists({"a": "a"}, "b") == False
- assert dict_key_exists({"a": {"b": "c"}}, "a.b") == True
- assert dict_key_exists({"a": {"b": None}}, "a.b") == True
- assert dict_key_exists({"a": {"b": "c"}}, "a.b.c") == False
+def test_dict_has_key():
+ assert dict_has_key({}, "") is False
+ assert dict_has_key({"a": "a"}, "a") == True
+ assert dict_has_key({"a": "a"}, "b") == False
+ assert dict_has_key({"a": {"b": "c"}}, "a.b") == True
+ assert dict_has_key({"a": {"b": None}}, "a.b") == True
+ assert dict_has_key({"a": {"b": "c"}}, "a.b.c") == False
def test_page_page_string():