complete FuzzyReleaseMatcher refactoring

We keep the name, since the API - "matcher.match(release)" - is the same. Changes:

* simplified queries; at most one query is performed against elasticsearch
* parallel release retrieval from the API
* optional support for release year windows

Test cases are expressed in YAML and will be auto-loaded from the specified directory. The tests work against the current search endpoint, which means the actual output may change on index updates; for the moment, we think this setup is relatively simple and not too unstable.

Example test case:

    about: title contrib, partial name
    input: >
      { "contribs": [ { "raw_name": "Adams" } ], "title": "digital libraries", "ext_ids": {} }
    release_year_padding: 1
    expected:
      - 7rmvqtrb2jdyhcxxodihzzcugy
      - a2u6ougtsjcbvczou6sazsulcm
      - dy45vilej5diros6zmax46nm4e
      - exuwhhayird4fdjmmsiqpponlq
      - gqrj7jikezgcfpjfazhpf4e7c4
      - mkmqt3453relbpuyktnmsg6hjq
      - t2g5sl3dgzchtnq7dynxyzje44
      - t4tvenhrvzamraxrvvxivxmvga
      - wd3oeoi3bffknfbg2ymleqc4ja
      - y63a6dhrfnb7bltlxfynydbojy
author: Martin Czygan <martin.czygan@gmail.com> 2021-11-17 14:51:50 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2021-12-06 19:53:30 +0100
commit: dd6149140542585f2b0bfc3b334ec2b0a88b790e (patch)
tree: 6a11c228558cfbf73932bc828cda9be3735cfd78 /tests/test_matching.py
parent: d104f8d0ba8eef5563555de82be66bbf17f961db (diff)
download: fuzzycat-dd6149140542585f2b0bfc3b334ec2b0a88b790e.tar.gz
fuzzycat-dd6149140542585f2b0bfc3b334ec2b0a88b790e.zip
1 file changed, 10 insertions, 84 deletions
diff --git a/tests/test_matching.py b/tests/test_matching.py
index a7754ee..b9d7fae 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -48,90 +48,16 @@ def es_client():
     return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL])
 
 
-def test_matcher_match_release(es_client, caplog):
-    cases = (
-        ("wtv64ahbdzgwnan7rllwr3nurm", 1),
-        ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
-    )
-    matcher = FuzzyReleaseMatcher(es=es_client, size=5)
-    for i, (ident, count) in enumerate(cases):
-        entity = anything_to_entity(ident, ReleaseEntity)
-        result = matcher.match(entity)
-        logger.info("[{}] given {}, found {}".format(i, entity.title, len(result)))
-        assert len(result) == count
-
-    # Partial data.
-    cases = (
-        ({
-            "title": "digital libraries",
-            "ext_ids": {}
-        }, 5),
-        ({
-            "title": "unlikelytitle",
-            "ext_ids": {}
-        }, 0),
-        ({
-            "title": "Imminent dystopia",
-            "ext_ids": {}
-        }, 5),
-        ({
-            "title": "",
-            "contribs": [{
-                "raw_name": "Aristoteles"
-            }],
-            "ext_ids": {}
-        }, 5),
-        # ({
-        #     "title": "Letter",
-        #     "contribs": [{"raw_name": "Claudel"}],
-        #     "ext_ids": {}
-        # }, 1),
-        # ({
-        #     "title": "The Future of Digital Scholarship",
-        #     "contribs": [{
-        #         "raw_name": "Costantino Thanos"
-        #     }],
-        #     "ext_ids": {}
-        # }, 5),
-    )
-    for i, (doc, count) in enumerate(cases):
-        entity = entity_from_dict(doc, ReleaseEntity)
-        result = matcher.match(entity)
-        with caplog.at_level(logging.INFO):
-            logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result),
-                                                                      [v.title for v in result]))
-        assert len(result) == count, doc
-
-
-def test_fuzzy_release_matcher_match_release_by_id(es_client, caplog):
-    matcher = FuzzyReleaseMatcher(es=es_client)
-    cases = (
-        ("wtv64ahbdzgwnan7rllwr3nurm", 1),
-        ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
-    )
-    for i, (ident, count) in enumerate(cases):
-        entity = anything_to_entity(ident, ReleaseEntity)
-        result = matcher.match_release_by_id(entity)
-        assert len(result) == count
-
-
-def test_fuzzy_release_match_release_exact_title_exact_contrib(es_client, caplog):
-    matcher = FuzzyReleaseMatcher(es=es_client)
-    Case = collections.namedtuple("Case", "title date input expected")
-    cases = yaml_to_cases(
-        Case, "tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml")
-    for i, c in enumerate(cases):
-        entity = entity_from_json(c.input, ReleaseEntity)
-        result = matcher.match_release_exact_title_exact_contrib(entity)
-        assert len(result) == c.expected, "[{}] {}".format(c.title, c.input)
-
-
-def test_fuzzy_release_match_release_exact_title_partial_contrib(es_client, caplog):
+def test_simple_fuzzy_release_matcher(es_client, caplog):
+    """
+    Use a single test function to test the higher level match function. We want
+    the result to be sensible, but should also document broken examples here.
+    """
     matcher = FuzzyReleaseMatcher(es=es_client)
-    Case = collections.namedtuple("Case", "title date input jaccard_index_threshold expected")
-    cases = yaml_to_cases(
-        Case, "tests/files/fuzzy_release_match_release_exact_title_partial_contrib/*.yaml")
+    Case = collections.namedtuple("Case", "about input release_year_padding expected")
+    cases = yaml_to_cases(Case, "tests/files/simple_fuzzy_release_matcher/*.yaml")
     for i, c in enumerate(cases):
+        matcher.release_year_padding = c.release_year_padding
         entity = entity_from_json(c.input, ReleaseEntity)
-        result = matcher.match_release_exact_title_partial_contrib(entity)
-        assert len(result) == c.expected, "[{}] {}".format(c.title, c.input)
+        result = matcher.match(entity)
+        assert set([r.ident for r in result]) == set(c.expected), "[{}] {}".format(c.about, c.input)
author	Martin Czygan <martin.czygan@gmail.com>	2021-11-17 14:51:50 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2021-12-06 19:53:30 +0100
commit	dd6149140542585f2b0bfc3b334ec2b0a88b790e (patch)
tree	6a11c228558cfbf73932bc828cda9be3735cfd78 /tests/test_matching.py
parent	d104f8d0ba8eef5563555de82be66bbf17f961db (diff)
download	fuzzycat-dd6149140542585f2b0bfc3b334ec2b0a88b790e.tar.gz fuzzycat-dd6149140542585f2b0bfc3b334ec2b0a88b790e.zip