From 0c84af603894049dd8edd95da18d8990ab0516d1 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 5 Nov 2021 17:19:07 +0100 Subject: turn "match_release_fuzzy" into a class Goal of this refactoring was to make the matching process a bit more configurable by using a class and a cascade of queries. For a limited test set: `FuzzyReleaseMatcher.match` works the same as `match_release_fuzzy`. --- tests/files/README.md | 5 +++++ .../0.yaml | 13 +++++++++++++ .../1.yaml | 13 +++++++++++++ .../2.yaml | 16 ++++++++++++++++ .../3.yaml | 16 ++++++++++++++++ .../4.yaml | 16 ++++++++++++++++ .../0.yaml | 14 ++++++++++++++ .../1.yaml | 14 ++++++++++++++ .../2.yaml | 17 +++++++++++++++++ .../3.yaml | 17 +++++++++++++++++ .../4.yaml | 17 +++++++++++++++++ .../5.yaml | 17 +++++++++++++++++ .../6.yaml | 14 ++++++++++++++ .../7.yaml | 17 +++++++++++++++++ 14 files changed, 206 insertions(+) create mode 100644 tests/files/README.md create mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml create mode 100644 
tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml create mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml (limited to 'tests/files') diff --git a/tests/files/README.md b/tests/files/README.md new file mode 100644 index 0000000..ef674d6 --- /dev/null +++ b/tests/files/README.md @@ -0,0 +1,5 @@ +# Matcher Test Files + +The goal here is to have mostly language-independent test cases for matching. + +Each subdirectory corresponds to a test function and contains examples for it. diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml new file mode 100644 index 0000000..2df8d9a --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml @@ -0,0 +1,13 @@ +title: titles are case insensitive +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Michael Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml new file mode 100644 index 0000000..1070408 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml @@ -0,0 +1,13 @@ +title: another vanilla query +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Poul-Henning Kamp" + } + ], + "title": "The hyperdimensional tar pit", + "ext_ids": {} + } +expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml new file mode 100644 index 0000000..882e746 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml @@ -0,0 +1,16 @@ +title: order of contribs does not matter +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Maurice 
Florence" + }, + { + "raw_name": "Tuomo Tiisala" + } + ], + "title": "Foucault", + "ext_ids": {} + } +expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml new file mode 100644 index 0000000..0a2ad12 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml @@ -0,0 +1,16 @@ +title: order of contribs does not matter +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Tuomo Tiisala" + }, + { + "raw_name": "Maurice Florence" + } + ], + "title": "Foucault", + "ext_ids": {} + } +expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml new file mode 100644 index 0000000..36ea0fe --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml @@ -0,0 +1,16 @@ +title: short version of name should not work +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Tuomo Tiisala" + }, + { + "raw_name": "M. 
Florence" + } + ], + "title": "Foucault", + "ext_ids": {} + } +expected: 0 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml new file mode 100644 index 0000000..07230e8 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml @@ -0,0 +1,14 @@ +title: titles are case insensitive +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Michael Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml new file mode 100644 index 0000000..62e9586 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml @@ -0,0 +1,14 @@ +title: another vanilla query +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Poul-Henning Kamp" + } + ], + "title": "The hyperdimensional tar pit", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml new file mode 100644 index 0000000..b89e825 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml @@ -0,0 +1,17 @@ +title: order of contribs does not matter +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Maurice Florence" + }, + { + "raw_name": "Tuomo Tiisala" + } + ], + "title": "Foucault", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml new file mode 100644 index 0000000..3de7262 --- /dev/null +++ 
b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml @@ -0,0 +1,17 @@ +title: order of contribs does not matter +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Tuomo Tiisala" + }, + { + "raw_name": "Maurice Florence" + } + ], + "title": "Foucault", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml new file mode 100644 index 0000000..39fb065 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml @@ -0,0 +1,17 @@ +title: short version of name should not work +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Tuomo Tiisala" + }, + { + "raw_name": "M. Florence" + } + ], + "title": "Foucault", + "ext_ids": {} + } +jaccard_index_threshold: 1.0 +expected: 0 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml new file mode 100644 index 0000000..fff19fa --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml @@ -0,0 +1,17 @@ +title: here, Iz Beltagy is missing from author, but still retrieved +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Arman Cohan" + }, + { + "raw_name": "Kyle Lo" + } + ], + "title": "SciBERT: A Pretrained Language Model for Scientific Text", + "ext_ids": {} + } +jaccard_index_threshold: 0.5 +expected: 3 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml new file mode 100644 index 0000000..d4e0025 --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml @@ -0,0 +1,14 @@ +title: here, 2/3 authors are missing, we fail with jaccard index 0.5 +date: 2021-11-08 +input: 
> + { + "contribs": [ + { + "raw_name": "Arman Cohan" + } + ], + "title": "SciBERT: A Pretrained Language Model for Scientific Text", + "ext_ids": {} + } +jaccard_index_threshold: 0.5 +expected: 0 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml new file mode 100644 index 0000000..23d5a8d --- /dev/null +++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml @@ -0,0 +1,17 @@ +title: match, despite trailing whitespace +date: 2021-11-08 +input: > + { + "contribs": [ + { + "raw_name": "Arman Cohan" + }, + { + "raw_name": "Kyle Lo" + } + ], + "title": "SciBERT: A Pretrained Language Model for Scientific Text ", + "ext_ids": {} + } +jaccard_index_threshold: 0.5 +expected: 3 -- cgit v1.2.3