about | summary | refs | log | tree | commit | diff | stats
path: root/tests/test_matching.py
diff options
context:
space:
mode:
Diffstat (limited to 'tests/test_matching.py')
-rw-r--r-- tests/test_matching.py 94
1 files changed, 10 insertions, 84 deletions
diff --git a/tests/test_matching.py b/tests/test_matching.py
index a7754ee..b9d7fae 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -48,90 +48,16 @@ def es_client():
return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL])
-def test_matcher_match_release(es_client, caplog):
- cases = (
- ("wtv64ahbdzgwnan7rllwr3nurm", 1),
- ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
- )
- matcher = FuzzyReleaseMatcher(es=es_client, size=5)
- for i, (ident, count) in enumerate(cases):
- entity = anything_to_entity(ident, ReleaseEntity)
- result = matcher.match(entity)
- logger.info("[{}] given {}, found {}".format(i, entity.title, len(result)))
- assert len(result) == count
-
- # Partial data.
- cases = (
- ({
- "title": "digital libraries",
- "ext_ids": {}
- }, 5),
- ({
- "title": "unlikelytitle",
- "ext_ids": {}
- }, 0),
- ({
- "title": "Imminent dystopia",
- "ext_ids": {}
- }, 5),
- ({
- "title": "",
- "contribs": [{
- "raw_name": "Aristoteles"
- }],
- "ext_ids": {}
- }, 5),
- # ({
- # "title": "Letter",
- # "contribs": [{"raw_name": "Claudel"}],
- # "ext_ids": {}
- # }, 1),
- # ({
- # "title": "The Future of Digital Scholarship",
- # "contribs": [{
- # "raw_name": "Costantino Thanos"
- # }],
- # "ext_ids": {}
- # }, 5),
- )
- for i, (doc, count) in enumerate(cases):
- entity = entity_from_dict(doc, ReleaseEntity)
- result = matcher.match(entity)
- with caplog.at_level(logging.INFO):
- logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result),
- [v.title for v in result]))
- assert len(result) == count, doc
-
-
-def test_fuzzy_release_matcher_match_release_by_id(es_client, caplog):
- matcher = FuzzyReleaseMatcher(es=es_client)
- cases = (
- ("wtv64ahbdzgwnan7rllwr3nurm", 1),
- ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
- )
- for i, (ident, count) in enumerate(cases):
- entity = anything_to_entity(ident, ReleaseEntity)
- result = matcher.match_release_by_id(entity)
- assert len(result) == count
-
-
-def test_fuzzy_release_match_release_exact_title_exact_contrib(es_client, caplog):
- matcher = FuzzyReleaseMatcher(es=es_client)
- Case = collections.namedtuple("Case", "title date input expected")
- cases = yaml_to_cases(
- Case, "tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml")
- for i, c in enumerate(cases):
- entity = entity_from_json(c.input, ReleaseEntity)
- result = matcher.match_release_exact_title_exact_contrib(entity)
- assert len(result) == c.expected, "[{}] {}".format(c.title, c.input)
-
-
-def test_fuzzy_release_match_release_exact_title_partial_contrib(es_client, caplog):
+def test_simple_fuzzy_release_matcher(es_client, caplog):
+ """
+ Use a single test function to test the higher level match function. We want
+ the result to be sensible, but should also document broken examples here.
+ """
matcher = FuzzyReleaseMatcher(es=es_client)
- Case = collections.namedtuple("Case", "title date input jaccard_index_threshold expected")
- cases = yaml_to_cases(
- Case, "tests/files/fuzzy_release_match_release_exact_title_partial_contrib/*.yaml")
+ Case = collections.namedtuple("Case", "about input release_year_padding expected")
+ cases = yaml_to_cases(Case, "tests/files/simple_fuzzy_release_matcher/*.yaml")
for i, c in enumerate(cases):
+ matcher.release_year_padding = c.release_year_padding
entity = entity_from_json(c.input, ReleaseEntity)
- result = matcher.match_release_exact_title_partial_contrib(entity)
- assert len(result) == c.expected, "[{}] {}".format(c.title, c.input)
+ result = matcher.match(entity)
+ assert set([r.ident for r in result]) == set(c.expected), "[{}] {}".format(c.about, c.input)