diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-11-17 14:51:50 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-12-06 19:53:30 +0100 |
commit | dd6149140542585f2b0bfc3b334ec2b0a88b790e (patch) | |
tree | 6a11c228558cfbf73932bc828cda9be3735cfd78 /tests/test_matching.py | |
parent | d104f8d0ba8eef5563555de82be66bbf17f961db (diff) | |
download | fuzzycat-dd6149140542585f2b0bfc3b334ec2b0a88b790e.tar.gz fuzzycat-dd6149140542585f2b0bfc3b334ec2b0a88b790e.zip |
complete FuzzyReleaseMatcher refactoring
We keep the name, since the API - "matcher.match(release)" - is the
same; simplified queries; at most one query is performed against
elasticsearch; parallel release retrieval from the API; optional support
for release year windows.
Test cases are expressed in yaml and will be auto-loaded from the
specified directory; tests work against the current search endpoint,
which means the actual output may change on index updates; for the
moment, we think this setup is relatively simple and not too unstable.
about: title contrib, partial name
input: >
{
"contribs": [
{
"raw_name": "Adams"
}
],
"title": "digital libraries",
"ext_ids": {}
}
release_year_padding: 1
expected:
- 7rmvqtrb2jdyhcxxodihzzcugy
- a2u6ougtsjcbvczou6sazsulcm
- dy45vilej5diros6zmax46nm4e
- exuwhhayird4fdjmmsiqpponlq
- gqrj7jikezgcfpjfazhpf4e7c4
- mkmqt3453relbpuyktnmsg6hjq
- t2g5sl3dgzchtnq7dynxyzje44
- t4tvenhrvzamraxrvvxivxmvga
- wd3oeoi3bffknfbg2ymleqc4ja
- y63a6dhrfnb7bltlxfynydbojy
Diffstat (limited to 'tests/test_matching.py')
-rw-r--r-- | tests/test_matching.py | 94 |
1 files changed, 10 insertions, 84 deletions
diff --git a/tests/test_matching.py b/tests/test_matching.py index a7754ee..b9d7fae 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -48,90 +48,16 @@ def es_client(): return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL]) -def test_matcher_match_release(es_client, caplog): - cases = ( - ("wtv64ahbdzgwnan7rllwr3nurm", 1), - ("eqcgtpav3na5jh56o5vjsvb4ei", 1), - ) - matcher = FuzzyReleaseMatcher(es=es_client, size=5) - for i, (ident, count) in enumerate(cases): - entity = anything_to_entity(ident, ReleaseEntity) - result = matcher.match(entity) - logger.info("[{}] given {}, found {}".format(i, entity.title, len(result))) - assert len(result) == count - - # Partial data. - cases = ( - ({ - "title": "digital libraries", - "ext_ids": {} - }, 5), - ({ - "title": "unlikelytitle", - "ext_ids": {} - }, 0), - ({ - "title": "Imminent dystopia", - "ext_ids": {} - }, 5), - ({ - "title": "", - "contribs": [{ - "raw_name": "Aristoteles" - }], - "ext_ids": {} - }, 5), - # ({ - # "title": "Letter", - # "contribs": [{"raw_name": "Claudel"}], - # "ext_ids": {} - # }, 1), - # ({ - # "title": "The Future of Digital Scholarship", - # "contribs": [{ - # "raw_name": "Costantino Thanos" - # }], - # "ext_ids": {} - # }, 5), - ) - for i, (doc, count) in enumerate(cases): - entity = entity_from_dict(doc, ReleaseEntity) - result = matcher.match(entity) - with caplog.at_level(logging.INFO): - logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result), - [v.title for v in result])) - assert len(result) == count, doc - - -def test_fuzzy_release_matcher_match_release_by_id(es_client, caplog): - matcher = FuzzyReleaseMatcher(es=es_client) - cases = ( - ("wtv64ahbdzgwnan7rllwr3nurm", 1), - ("eqcgtpav3na5jh56o5vjsvb4ei", 1), - ) - for i, (ident, count) in enumerate(cases): - entity = anything_to_entity(ident, ReleaseEntity) - result = matcher.match_release_by_id(entity) - assert len(result) == count - - -def 
test_fuzzy_release_match_release_exact_title_exact_contrib(es_client, caplog): - matcher = FuzzyReleaseMatcher(es=es_client) - Case = collections.namedtuple("Case", "title date input expected") - cases = yaml_to_cases( - Case, "tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml") - for i, c in enumerate(cases): - entity = entity_from_json(c.input, ReleaseEntity) - result = matcher.match_release_exact_title_exact_contrib(entity) - assert len(result) == c.expected, "[{}] {}".format(c.title, c.input) - - -def test_fuzzy_release_match_release_exact_title_partial_contrib(es_client, caplog): +def test_simple_fuzzy_release_matcher(es_client, caplog): + """ + Use a single test function to test the higher level match function. We want + the result to be sensible, but should also document broken examples here. + """ matcher = FuzzyReleaseMatcher(es=es_client) - Case = collections.namedtuple("Case", "title date input jaccard_index_threshold expected") - cases = yaml_to_cases( - Case, "tests/files/fuzzy_release_match_release_exact_title_partial_contrib/*.yaml") + Case = collections.namedtuple("Case", "about input release_year_padding expected") + cases = yaml_to_cases(Case, "tests/files/simple_fuzzy_release_matcher/*.yaml") for i, c in enumerate(cases): + matcher.release_year_padding = c.release_year_padding entity = entity_from_json(c.input, ReleaseEntity) - result = matcher.match_release_exact_title_partial_contrib(entity) - assert len(result) == c.expected, "[{}] {}".format(c.title, c.input) + result = matcher.match(entity) + assert set([r.ident for r in result]) == set(c.expected), "[{}] {}".format(c.about, c.input) |