diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-11-05 17:19:07 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-11-16 18:58:42 +0100 |
commit | 0c84af603894049dd8edd95da18d8990ab0516d1 (patch) | |
tree | 08fb4ad2b3a498e2edac73972f97e427e0194759 /tests | |
parent | 282f315c6ba3643c8c614220ab2f7e1d55de3658 (diff) | |
download | fuzzycat-0c84af603894049dd8edd95da18d8990ab0516d1.tar.gz fuzzycat-0c84af603894049dd8edd95da18d8990ab0516d1.zip |
turn "match_release_fuzzy" into a class
Goal of this refactoring was to make the matching process a bit more
configurable by using a class and a cascade of queries.
For a limited test set: `FuzzyReleaseMatcher.match` works the same as
`match_release_fuzzy`.
Diffstat (limited to 'tests')
16 files changed, 323 insertions, 12 deletions
diff --git a/tests/files/README.md b/tests/files/README.md new file mode 100644 index 0000000..ef674d6 --- /dev/null +++ b/tests/files/README.md @@ -0,0 +1,5 @@ +# Matcher Test Files + +The goal here is to have a mostly language-independent test cases for matching. + +Each subdirectory corresponds to a test function and contains examples for it. diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml new file mode 100644 index 0000000..2df8d9a --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml @@ -0,0 +1,13 @@ +title: titles are case insensitive +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Michael Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml new file mode 100644 index 0000000..1070408 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml @@ -0,0 +1,13 @@ +title: another vanilla query +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Poul-Henning Kamp" + } + ], + "title": "The hyperdimensional tar pit", + "ext_ids": {} + } +expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml new file mode 100644 index 0000000..882e746 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml @@ -0,0 +1,16 @@ +title: order of contribs does not matter +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Maurice Florence" + }, + { + "raw_name": "Tuomo Tiisala" + } + ], + "title": "Foucault", + "ext_ids": {} + } +expected: 1 diff --git 
a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml new file mode 100644 index 0000000..0a2ad12 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml @@ -0,0 +1,16 @@ +title: order of contribs does not matter +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Tuomo Tiisala" + }, + { + "raw_name": "Maurice Florence" + } + ], + "title": "Foucault", + "ext_ids": {} + } +expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml new file mode 100644 index 0000000..36ea0fe --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml @@ -0,0 +1,16 @@ +title: short version of name should not work +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Tuomo Tiisala" + }, + { + "raw_name": "M. 
Florence" + } + ], + "title": "Foucault", + "ext_ids": {} + } +expected: 0 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml new file mode 100644 index 0000000..07230e8 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml @@ -0,0 +1,14 @@ +title: titles are case insensitive +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Michael Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml new file mode 100644 index 0000000..62e9586 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml @@ -0,0 +1,14 @@ +title: another vanilla query +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Poul-Henning Kamp" + } + ], + "title": "The hyperdimensional tar pit", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml new file mode 100644 index 0000000..b89e825 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml @@ -0,0 +1,17 @@ +title: order of contribs does not matter +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Maurice Florence" + }, + { + "raw_name": "Tuomo Tiisala" + } + ], + "title": "Foucault", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml new file mode 100644 index 0000000..3de7262 --- /dev/null +++ 
b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml @@ -0,0 +1,17 @@ +title: order of contribs does not matter +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Tuomo Tiisala" + }, + { + "raw_name": "Maurice Florence" + } + ], + "title": "Foucault", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml new file mode 100644 index 0000000..39fb065 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml @@ -0,0 +1,17 @@ +title: short version of name should not work +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Tuomo Tiisala" + }, + { + "raw_name": "M. Florence" + } + ], + "title": "Foucault", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 0 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml new file mode 100644 index 0000000..fff19fa --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml @@ -0,0 +1,17 @@ +title: here, Iz Beltagy is missing from author, but still retrieved +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Arman Cohan" + }, + { + "raw_name": "Kyle Lo" + } + ], + "title": "SciBERT: A Pretrained Language Model for Scientific Text", + "ext_ids": {} + } +jaccard_index_threshold: 0.5 +expected: 3 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml new file mode 100644 index 0000000..d4e0025 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml @@ -0,0 +1,14 @@ +title: here, 2/3 authors are missing, we fail with jaccard index 0.5 +date: 2021-11-08 +input: 
> + { + "contribs": [ + { + "raw_name": "Arman Cohan" + } + ], + "title": "SciBERT: A Pretrained Language Model for Scientific Text", + "ext_ids": {} + } +jaccard_index_threshold: 0.5 +expected: 0 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml new file mode 100644 index 0000000..23d5a8d --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml @@ -0,0 +1,17 @@ +title: match, despite trailing whitespace +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Arman Cohan" + }, + { + "raw_name": "Kyle Lo" + } + ], + "title": "SciBERT: A Pretrained Language Model for Scientific Text ", + "ext_ids": {} + } +jaccard_index_threshold: 0.5 +expected: 3 diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py index cf71f91..f36f9a4 100644 --- a/tests/test_grobid_unstructured.py +++ b/tests/test_grobid_unstructured.py @@ -18,11 +18,7 @@ def test_grobid_ref_to_release(): given_name='ahab', surname='sailor', ), - GrobidAuthor( - full_name='mary jane', - given_name='mary', - surname='jane' - ), + GrobidAuthor(full_name='mary jane', given_name='mary', surname='jane'), ], ) r = grobid_ref_to_release(d) diff --git a/tests/test_matching.py b/tests/test_matching.py index ad971a5..ca94c2a 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -1,13 +1,14 @@ +import collections import logging import warnings import elasticsearch import pytest import requests -from fatcat_openapi_client import ReleaseEntity +from fatcat_openapi_client import ReleaseEntity, ReleaseContrib -from fuzzycat.entities import entity_from_dict -from fuzzycat.matching import anything_to_entity, match_release_fuzzy +from fuzzycat.entities import entity_from_dict, entity_from_json +from fuzzycat.matching import anything_to_entity, match_release_fuzzy, FuzzyReleaseMatcher warnings.filterwarnings( "ignore") # 
InsecureRequestWarning: Unverified HTTPS request is being made to host ... @@ -18,6 +19,9 @@ from fatcat_openapi_client import ReleaseEntity import pytest import elasticsearch import logging +import yaml +import glob +import json logger = logging.getLogger('test_matching') logger.setLevel(logging.DEBUG) @@ -40,19 +44,35 @@ def is_reachable(url, timeout=3): return False +def yaml_to_cases(klass, + files="tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml"): + """ + Turn yaml files into a collection of named tuple test cases. The glob is + relative to the project root (i.e. where you usually run `pytest` from). + """ + cases = [] + for path in glob.glob(files): + with open(path) as f: + doc = yaml.load(f, Loader=yaml.Loader) + cases.append(klass(**doc)) + return cases + + @pytest.fixture def es_client(): return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL]) -@pytest.mark.skipif( - is_not_reachable(FATCAT_SEARCH_URL), - reason="{} not reachable, use e.g. FUZZYCAT_FATCAT_SEARCH_URL=localhost:9200 to override". - format(FATCAT_SEARCH_URL)) +# @pytest.mark.skipif( +# is_not_reachable(FATCAT_SEARCH_URL), +# reason="{} not reachable, use e.g. FUZZYCAT_FATCAT_SEARCH_URL=localhost:9200 to override". +# format(FATCAT_SEARCH_URL)) def test_match_release_fuzzy(es_client, caplog): """ This test is tied to the current index contents, so if that changes, this test may fail as well. + + Note: Deprecated. We want to get rid of this. 
""" cases = ( ("wtv64ahbdzgwnan7rllwr3nurm", 1), @@ -106,3 +126,92 @@ def test_match_release_fuzzy(es_client, caplog): logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result), [v.title for v in result])) assert len(result) == count, doc + + +def test_matcher_match_release(es_client, caplog): + cases = ( + ("wtv64ahbdzgwnan7rllwr3nurm", 1), + ("eqcgtpav3na5jh56o5vjsvb4ei", 1), + ) + matcher = FuzzyReleaseMatcher(es=es_client, size=5) + for i, (ident, count) in enumerate(cases): + entity = anything_to_entity(ident, ReleaseEntity) + result = matcher.match(entity) + logger.info("[{}] given {}, found {}".format(i, entity.title, len(result))) + assert len(result) == count + + # Partial data. + cases = ( + ({ + "title": "digital libraries", + "ext_ids": {} + }, 5), + ({ + "title": "unlikelytitle", + "ext_ids": {} + }, 0), + ({ + "title": "Imminent dystopia", + "ext_ids": {} + }, 5), + ({ + "title": "", + "contribs": [{ + "raw_name": "Aristoteles" + }], + "ext_ids": {} + }, 5), + # ({ + # "title": "Letter", + # "contribs": [{"raw_name": "Claudel"}], + # "ext_ids": {} + # }, 1), + # ({ + # "title": "The Future of Digital Scholarship", + # "contribs": [{ + # "raw_name": "Costantino Thanos" + # }], + # "ext_ids": {} + # }, 5), + ) + for i, (doc, count) in enumerate(cases): + entity = entity_from_dict(doc, ReleaseEntity) + result = matcher.match(entity) + with caplog.at_level(logging.INFO): + logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result), + [v.title for v in result])) + assert len(result) == count, doc + + +def test_fuzzy_release_matcher_match_release_by_id(es_client, caplog): + matcher = FuzzyReleaseMatcher(es=es_client) + cases = ( + ("wtv64ahbdzgwnan7rllwr3nurm", 1), + ("eqcgtpav3na5jh56o5vjsvb4ei", 1), + ) + for i, (ident, count) in enumerate(cases): + entity = anything_to_entity(ident, ReleaseEntity) + result = matcher.match_release_by_id(entity) + assert len(result) == count + + +def 
test_fuzzy_release_match_release_exact_title_exact_contrib(es_client, caplog): + matcher = FuzzyReleaseMatcher(es=es_client) + Case = collections.namedtuple("Case", "title date input expected") + cases = yaml_to_cases( + Case, "tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml") + for i, c in enumerate(cases): + entity = entity_from_json(c.input, ReleaseEntity) + result = matcher.match_release_exact_title_exact_contrib(entity) + assert len(result) == c.expected, "[{}] {}".format(c.title, c.input) + + +def test_fuzzy_release_match_release_exact_title_partial_contrib(es_client, caplog): + matcher = FuzzyReleaseMatcher(es=es_client) + Case = collections.namedtuple("Case", "title date input jaccard_index_threshold expected") + cases = yaml_to_cases( + Case, "tests/files/fuzzy_release_match_release_exact_title_partial_contrib/*.yaml") + for i, c in enumerate(cases): + entity = entity_from_json(c.input, ReleaseEntity) + result = matcher.match_release_exact_title_partial_contrib(entity) + assert len(result) == c.expected, "[{}] {}".format(c.title, c.input) |