author     Martin Czygan <martin.czygan@gmail.com>    2021-09-24 13:58:51 +0200
committer  Martin Czygan <martin.czygan@gmail.com>    2021-09-24 13:58:51 +0200
commit     478d7d06ad9e56145cb94f3461c355b1ba9eb491 (patch)
tree       fa467290e8c8df41a1e97a6de751d0f7e790c9de /tests
parent     86cc3191ce03042ef4a0c6c8a44f4094a140b802 (diff)
download   fuzzycat-478d7d06ad9e56145cb94f3461c355b1ba9eb491.tar.gz
           fuzzycat-478d7d06ad9e56145cb94f3461c355b1ba9eb491.zip
start larger refactoring: remove cluster
background: verifying hundreds of millions of documents turned out to be
slow; anecdata: running clustering and verification over 1.8B inputs took
over 50h, whereas the Go port (skate) required about 2-4h for the same
operations. Also, with Go we do not need the extra GNU parallel wrapping.
In any case, we aim for the fuzzycat refactoring to provide:
* better, more configurable verification and small-scale matching
* removal of the batch clustering code (and improved refcat docs)
* a place for more generic, similarity-based utilities (sketched below)
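For a sense of what such similarity-based utilities look like, here is a minimal sketch in the spirit of the helpers exercised in tests/test_utils.py (names like jaccard and tokenize_string appear there); this is an illustration, not fuzzycat's exact implementation:

def tokenize_string(s):
    # Simplified tokenizer: lowercase and split on whitespace.
    return s.lower().split()


def jaccard(a, b):
    # Jaccard similarity of two token sets: size of the intersection
    # divided by size of the union.
    a, b = set(a), set(b)
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)


# Two title variants that share all tokens score 1.0.
print(jaccard(tokenize_string("Hello World"), tokenize_string("hello world")))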
The most important piece in fuzzycat is a CSV file containing hand-picked
test examples for verification, plus the code that is able to satisfy that
test suite. We want to make this part more robust.
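As an illustration of how a test driven by that CSV file could look, here is a hedged sketch; the file path (tests/data/verify.csv), the column layout (two inputs plus an expected status), and the fuzzycat.verify.verify import and its (status, reason) return shape are assumptions, not a description of the actual code:

import csv

import pytest

from fuzzycat.verify import verify  # assumed import path and signature


def load_cases(path="tests/data/verify.csv"):
    # Assumed layout: each row holds two release blobs/idents and the expected status.
    with open(path) as handle:
        return [tuple(row) for row in csv.reader(handle)]


@pytest.mark.parametrize("a, b, expected", load_cases())
def test_verify_examples(a, b, expected):
    # The hand-picked examples serve as a regression suite for the verifier.
    status, _reason = verify(a, b)  # assumed (status, reason) return value
    assert status == expected, "verification mismatch for ({}, {})".format(a, b)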
Diffstat (limited to 'tests')
-rw-r--r--  tests/test_cluster.py  | 189
-rw-r--r--  tests/test_matching.py |   6
-rw-r--r--  tests/test_utils.py    |  16
3 files changed, 12 insertions, 199 deletions
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
deleted file mode 100644
index 55b349a..0000000
--- a/tests/test_cluster.py
+++ /dev/null
@@ -1,189 +0,0 @@
-import collections
-import io
-import json
-import os
-import tempfile
-
-import pytest
-
-from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
-                              release_key_title_nysiis)
-
-Case = collections.namedtuple("Case", 'input output')
-
-
-def test_release_key_title():
-    with pytest.raises(KeyError):
-        release_key_title({})
-    with pytest.raises(KeyError, match='title'):
-        release_key_title({'ident': '123'})
-    with pytest.raises(KeyError, match='ident'):
-        release_key_title({'title': 'deep learning backdoor'})
-    with pytest.raises(ValueError, match='title.*missing'):
-        release_key_title({'ident': '', 'title': ''})
-    cases = (
-        Case(input={
-            'ident': '',
-            'title': 'simhash'
-        }, output=('', 'simhash')),
-        Case(input={
-            'ident': '',
-            'title': 'Simhash'
-        }, output=('', 'Simhash')),
-        Case(input={
-            'ident': '',
-            'title': 'Sim hash'
-        }, output=('', 'Sim hash')),
-    )
-    for case in cases:
-        assert case.output == release_key_title(case.input)
-
-
-def test_release_key_title_normalized():
-    cases = (
-        Case(input={
-            'ident': '',
-            'title': 'simhash'
-        }, output=('', 'simhash')),
-        Case(input={
-            'ident': '',
-            'title': 'Simhash'
-        }, output=('', 'simhash')),
-        Case(input={
-            'ident': '',
-            'title': 'Sim hash'
-        }, output=('', 'simhash')),
-        Case(input={
-            'ident': '',
-            'title': 'THE year 1929'
-        }, output=('', 'theyear1929')),
-        Case(input={
-            'ident': '',
-            'title': '2019?'
-        }, output=('', '2019')),
-        Case(input={
-            'ident': '123',
-            'title': 'H~~2019?'
-        }, output=('123', 'h2019')),
-    )
-    for case in cases:
-        assert case.output == release_key_title_normalized(case.input), 'failed case {}'.format(
-            case.input)
-
-
-def test_release_key_title_nysiis():
-    cases = (
-        Case(input={
-            'ident': '',
-            'title': 'simhash'
-        }, output=('', 'SANM')),
-        Case(input={
-            'ident': '',
-            'title': 'Simhash'
-        }, output=('', 'SANM')),
-        Case(input={
-            'ident': '',
-            'title': 'Sim hash'
-        }, output=('', 'SAN')),
-        Case(input={
-            'ident': '',
-            'title': 'THE year 1929'
-        }, output=('', 'T')),
-        Case(input={
-            'ident': '',
-            'title': '2019?'
-        }, output=('', '2019?')),
-        Case(input={
-            'ident': '123',
-            'title': 'H~~2019?'
-        }, output=('123', 'H~2019?')),
-        Case(input={
-            'ident': '123',
-            'title': '世界'
-        }, output=('123', '世界')),
-    )
-    for case in cases:
-        assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(
-            case.input)
-
-
-def test_cluster():
-    sio = io.StringIO()
-    lines = [
-        json.dumps(doc) for doc in [
-            {
-                "title": "hello world",
-                "ident": 1,
-            },
-            {
-                "title": "hello world!",
-                "ident": 2,
-            },
-        ]
-    ]
-    cluster = Cluster(lines, release_key_title_normalized, output=sio)
-    stats = cluster.run()
-    assert stats == {
-        "key_fail": 0,
-        "key_ok": 2,
-        "key_empty": 0,
-        "key_denylist": 0,
-        "num_clusters": 1
-    }
-    assert json.loads(sio.getvalue()) == {
-        "k": "helloworld",
-        "v": [{
-            "title": "hello world!",
-            "ident": 2
-        }, {
-            "title": "hello world",
-            "ident": 1
-        }]
-    }
-
-    sio = io.StringIO()
-    cluster = Cluster([
-        json.dumps(line) for line in [
-            {
-                "title": "hello world",
-                "ident": 1
-            },
-            {
-                "title": "hello world!",
-                "ident": 2
-            },
-            {
-                "title": "other",
-                "ident": 3
-            },
-        ]
-    ],
-                      release_key_title_normalized,
-                      min_cluster_size=1,
-                      output=sio)
-    stats = cluster.run()
-    assert stats == {
-        "key_fail": 0,
-        "key_ok": 3,
-        "key_empty": 0,
-        "key_denylist": 0,
-        "num_clusters": 2
-    }
-    assert [json.loads(line) for line in sio.getvalue().split("\n") if line] == [{
-        "k":
-        "helloworld",
-        "v": [{
-            "title": "hello world!",
-            "ident": 2
-        }, {
-            "title": "hello world",
-            "ident": 1
-        }]
-    }, {
-        'k':
-        'other',
-        'v': [{
-            'ident': 3,
-            'title': 'other'
-        }]
-    }]
diff --git a/tests/test_matching.py b/tests/test_matching.py
index 90d1fee..ad971a5 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -81,7 +81,9 @@ def test_match_release_fuzzy(es_client, caplog):
         }, 2),
         ({
             "title": "",
-            "contribs": [{"raw_name": "Aristoteles"}],
+            "contribs": [{
+                "raw_name": "Aristoteles"
+            }],
             "ext_ids": {}
         }, 5),
         # ({
@@ -102,5 +104,5 @@ def test_match_release_fuzzy(es_client, caplog):
         result = match_release_fuzzy(entity, es=es_client)
         with caplog.at_level(logging.INFO):
             logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result),
-                                                                      [v.title for v in result]))
+                                                                        [v.title for v in result]))
         assert len(result) == count, doc
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 957203f..b2242b8 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,7 +2,7 @@ import pytest
 import os
 
 from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string,
-                            token_n_grams, tokenize_string, parse_page_string, dict_key_exists,
+                            token_n_grams, tokenize_string, parse_page_string, dict_has_key,
                             zstdlines, es_compat_hits_total, clean_doi)
 
 
@@ -67,13 +67,13 @@ def test_nwise():
     assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]
 
 
-def test_dict_key_exists():
-    assert dict_key_exists({}, "") is False
-    assert dict_key_exists({"a": "a"}, "a") == True
-    assert dict_key_exists({"a": "a"}, "b") == False
-    assert dict_key_exists({"a": {"b": "c"}}, "a.b") == True
-    assert dict_key_exists({"a": {"b": None}}, "a.b") == True
-    assert dict_key_exists({"a": {"b": "c"}}, "a.b.c") == False
+def test_dict_has_key():
+    assert dict_has_key({}, "") is False
+    assert dict_has_key({"a": "a"}, "a") == True
+    assert dict_has_key({"a": "a"}, "b") == False
+    assert dict_has_key({"a": {"b": "c"}}, "a.b") == True
+    assert dict_has_key({"a": {"b": None}}, "a.b") == True
+    assert dict_has_key({"a": {"b": "c"}}, "a.b.c") == False
 
 
 def test_page_page_string():
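The dict_key_exists to dict_has_key rename above is specified entirely by the assertions in test_dict_has_key: the helper checks whether a dotted path exists in a nested dictionary, even when the final value is None. One possible implementation consistent with those assertions (a sketch for illustration, not necessarily the code in fuzzycat.utils) is:

def dict_has_key(doc, path):
    # Return True if the nested dictionary contains the dotted path,
    # e.g. dict_has_key({"a": {"b": None}}, "a.b") is True, while an
    # empty path or a missing key yields False.
    keys = path.split(".") if path else []
    if not keys:
        return False
    current = doc
    for key in keys:
        if not isinstance(current, dict) or key not in current:
            return False
        current = current[key]
    return True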