aboutsummaryrefslogtreecommitdiffstats
path: root/tests
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-11-05 17:19:07 +0100
committerMartin Czygan <martin.czygan@gmail.com>2021-11-16 18:58:42 +0100
commit0c84af603894049dd8edd95da18d8990ab0516d1 (patch)
tree08fb4ad2b3a498e2edac73972f97e427e0194759 /tests
parent282f315c6ba3643c8c614220ab2f7e1d55de3658 (diff)
downloadfuzzycat-0c84af603894049dd8edd95da18d8990ab0516d1.tar.gz
fuzzycat-0c84af603894049dd8edd95da18d8990ab0516d1.zip
turn "match_release_fuzzy" into a class
The goal of this refactoring was to make the matching process a bit more configurable by using a class and a cascade of queries. For a limited test set, `FuzzyReleaseMatcher.match` works the same as `match_release_fuzzy`.
Diffstat (limited to 'tests')
-rw-r--r--tests/files/README.md5
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml13
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml13
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml16
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml16
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml16
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml14
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml14
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml17
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml17
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml17
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml17
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml14
-rw-r--r--tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml17
-rw-r--r--tests/test_grobid_unstructured.py6
-rw-r--r--tests/test_matching.py123
16 files changed, 323 insertions, 12 deletions
diff --git a/tests/files/README.md b/tests/files/README.md
new file mode 100644
index 0000000..ef674d6
--- /dev/null
+++ b/tests/files/README.md
@@ -0,0 +1,5 @@
+# Matcher Test Files
+
+The goal here is to have mostly language-independent test cases for matching.
+
+Each subdirectory corresponds to a test function and contains examples for it.
diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml
new file mode 100644
index 0000000..2df8d9a
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml
@@ -0,0 +1,13 @@
+title: titles are case insensitive
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Michael Adams"
+ }
+ ],
+ "title": "digital libraries",
+ "ext_ids": {}
+ }
+expected: 2
diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml
new file mode 100644
index 0000000..1070408
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml
@@ -0,0 +1,13 @@
+title: another vanilla query
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Poul-Henning Kamp"
+ }
+ ],
+ "title": "The hyperdimensional tar pit",
+ "ext_ids": {}
+ }
+expected: 2
diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml
new file mode 100644
index 0000000..882e746
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml
@@ -0,0 +1,16 @@
+title: order of contribs does not matter
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Maurice Florence"
+ },
+ {
+ "raw_name": "Tuomo Tiisala"
+ }
+ ],
+ "title": "Foucault",
+ "ext_ids": {}
+ }
+expected: 1
diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml
new file mode 100644
index 0000000..0a2ad12
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml
@@ -0,0 +1,16 @@
+title: order of contribs does not matter
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Tuomo Tiisala"
+ },
+ {
+ "raw_name": "Maurice Florence"
+ }
+ ],
+ "title": "Foucault",
+ "ext_ids": {}
+ }
+expected: 1
diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml
new file mode 100644
index 0000000..36ea0fe
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml
@@ -0,0 +1,16 @@
+title: short version of name should not work
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Tuomo Tiisala"
+ },
+ {
+ "raw_name": "M. Florence"
+ }
+ ],
+ "title": "Foucault",
+ "ext_ids": {}
+ }
+expected: 0
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml
new file mode 100644
index 0000000..07230e8
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml
@@ -0,0 +1,14 @@
+title: titles are case insensitive
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Michael Adams"
+ }
+ ],
+ "title": "digital libraries",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 1.0
+expected: 2
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml
new file mode 100644
index 0000000..62e9586
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml
@@ -0,0 +1,14 @@
+title: another vanilla query
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Poul-Henning Kamp"
+ }
+ ],
+ "title": "The hyperdimensional tar pit",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 1.0
+expected: 2
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml
new file mode 100644
index 0000000..b89e825
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml
@@ -0,0 +1,17 @@
+title: order of contribs does not matter
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Maurice Florence"
+ },
+ {
+ "raw_name": "Tuomo Tiisala"
+ }
+ ],
+ "title": "Foucault",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 1.0
+expected: 1
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml
new file mode 100644
index 0000000..3de7262
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml
@@ -0,0 +1,17 @@
+title: order of contribs does not matter
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Tuomo Tiisala"
+ },
+ {
+ "raw_name": "Maurice Florence"
+ }
+ ],
+ "title": "Foucault",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 1.0
+expected: 1
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml
new file mode 100644
index 0000000..39fb065
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml
@@ -0,0 +1,17 @@
+title: short version of name should not work
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Tuomo Tiisala"
+ },
+ {
+ "raw_name": "M. Florence"
+ }
+ ],
+ "title": "Foucault",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 1.0
+expected: 0
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml
new file mode 100644
index 0000000..fff19fa
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml
@@ -0,0 +1,17 @@
+title: here, Iz Beltagy is missing from author, but still retrieved
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Arman Cohan"
+ },
+ {
+ "raw_name": "Kyle Lo"
+ }
+ ],
+ "title": "SciBERT: A Pretrained Language Model for Scientific Text",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 0.5
+expected: 3
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml
new file mode 100644
index 0000000..d4e0025
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml
@@ -0,0 +1,14 @@
+title: here, 2/3 authors are missing, we fail with jaccard index 0.5
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Arman Cohan"
+ }
+ ],
+ "title": "SciBERT: A Pretrained Language Model for Scientific Text",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 0.5
+expected: 0
diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml
new file mode 100644
index 0000000..23d5a8d
--- /dev/null
+++ b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml
@@ -0,0 +1,17 @@
+title: match, despite trailing whitespace
+date: 2021-11-08
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Arman Cohan"
+ },
+ {
+ "raw_name": "Kyle Lo"
+ }
+ ],
+ "title": "SciBERT: A Pretrained Language Model for Scientific Text ",
+ "ext_ids": {}
+ }
+jaccard_index_threshold: 0.5
+expected: 3
diff --git a/tests/test_grobid_unstructured.py b/tests/test_grobid_unstructured.py
index cf71f91..f36f9a4 100644
--- a/tests/test_grobid_unstructured.py
+++ b/tests/test_grobid_unstructured.py
@@ -18,11 +18,7 @@ def test_grobid_ref_to_release():
given_name='ahab',
surname='sailor',
),
- GrobidAuthor(
- full_name='mary jane',
- given_name='mary',
- surname='jane'
- ),
+ GrobidAuthor(full_name='mary jane', given_name='mary', surname='jane'),
],
)
r = grobid_ref_to_release(d)
diff --git a/tests/test_matching.py b/tests/test_matching.py
index ad971a5..ca94c2a 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -1,13 +1,14 @@
+import collections
import logging
import warnings
import elasticsearch
import pytest
import requests
-from fatcat_openapi_client import ReleaseEntity
+from fatcat_openapi_client import ReleaseEntity, ReleaseContrib
-from fuzzycat.entities import entity_from_dict
-from fuzzycat.matching import anything_to_entity, match_release_fuzzy
+from fuzzycat.entities import entity_from_dict, entity_from_json
+from fuzzycat.matching import anything_to_entity, match_release_fuzzy, FuzzyReleaseMatcher
warnings.filterwarnings(
"ignore") # InsecureRequestWarning: Unverified HTTPS request is being made to host ...
@@ -18,6 +19,9 @@ from fatcat_openapi_client import ReleaseEntity
import pytest
import elasticsearch
import logging
+import yaml
+import glob
+import json
logger = logging.getLogger('test_matching')
logger.setLevel(logging.DEBUG)
@@ -40,19 +44,35 @@ def is_reachable(url, timeout=3):
return False
+def yaml_to_cases(klass,
+ files="tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml"):
+ """
+ Turn yaml files into a collection of named tuple test cases. The glob is
+ relative to the project root (i.e. where you usually run `pytest` from).
+ """
+ cases = []
+ for path in glob.glob(files):
+ with open(path) as f:
+ doc = yaml.load(f, Loader=yaml.Loader)
+ cases.append(klass(**doc))
+ return cases
+
+
@pytest.fixture
def es_client():
return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL])
-@pytest.mark.skipif(
- is_not_reachable(FATCAT_SEARCH_URL),
- reason="{} not reachable, use e.g. FUZZYCAT_FATCAT_SEARCH_URL=localhost:9200 to override".
- format(FATCAT_SEARCH_URL))
+# @pytest.mark.skipif(
+# is_not_reachable(FATCAT_SEARCH_URL),
+# reason="{} not reachable, use e.g. FUZZYCAT_FATCAT_SEARCH_URL=localhost:9200 to override".
+# format(FATCAT_SEARCH_URL))
def test_match_release_fuzzy(es_client, caplog):
"""
This test is tied to the current index contents, so if that changes, this
test may fail as well.
+
+ Note: Deprecated. We want to get rid of this.
"""
cases = (
("wtv64ahbdzgwnan7rllwr3nurm", 1),
@@ -106,3 +126,92 @@ def test_match_release_fuzzy(es_client, caplog):
logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result),
[v.title for v in result]))
assert len(result) == count, doc
+
+
+def test_matcher_match_release(es_client, caplog):
+ cases = (
+ ("wtv64ahbdzgwnan7rllwr3nurm", 1),
+ ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
+ )
+ matcher = FuzzyReleaseMatcher(es=es_client, size=5)
+ for i, (ident, count) in enumerate(cases):
+ entity = anything_to_entity(ident, ReleaseEntity)
+ result = matcher.match(entity)
+ logger.info("[{}] given {}, found {}".format(i, entity.title, len(result)))
+ assert len(result) == count
+
+ # Partial data.
+ cases = (
+ ({
+ "title": "digital libraries",
+ "ext_ids": {}
+ }, 5),
+ ({
+ "title": "unlikelytitle",
+ "ext_ids": {}
+ }, 0),
+ ({
+ "title": "Imminent dystopia",
+ "ext_ids": {}
+ }, 5),
+ ({
+ "title": "",
+ "contribs": [{
+ "raw_name": "Aristoteles"
+ }],
+ "ext_ids": {}
+ }, 5),
+ # ({
+ # "title": "Letter",
+ # "contribs": [{"raw_name": "Claudel"}],
+ # "ext_ids": {}
+ # }, 1),
+ # ({
+ # "title": "The Future of Digital Scholarship",
+ # "contribs": [{
+ # "raw_name": "Costantino Thanos"
+ # }],
+ # "ext_ids": {}
+ # }, 5),
+ )
+ for i, (doc, count) in enumerate(cases):
+ entity = entity_from_dict(doc, ReleaseEntity)
+ result = matcher.match(entity)
+ with caplog.at_level(logging.INFO):
+ logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result),
+ [v.title for v in result]))
+ assert len(result) == count, doc
+
+
+def test_fuzzy_release_matcher_match_release_by_id(es_client, caplog):
+ matcher = FuzzyReleaseMatcher(es=es_client)
+ cases = (
+ ("wtv64ahbdzgwnan7rllwr3nurm", 1),
+ ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
+ )
+ for i, (ident, count) in enumerate(cases):
+ entity = anything_to_entity(ident, ReleaseEntity)
+ result = matcher.match_release_by_id(entity)
+ assert len(result) == count
+
+
+def test_fuzzy_release_match_release_exact_title_exact_contrib(es_client, caplog):
+ matcher = FuzzyReleaseMatcher(es=es_client)
+ Case = collections.namedtuple("Case", "title date input expected")
+ cases = yaml_to_cases(
+ Case, "tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml")
+ for i, c in enumerate(cases):
+ entity = entity_from_json(c.input, ReleaseEntity)
+ result = matcher.match_release_exact_title_exact_contrib(entity)
+ assert len(result) == c.expected, "[{}] {}".format(c.title, c.input)
+
+
+def test_fuzzy_release_match_release_exact_title_partial_contrib(es_client, caplog):
+ matcher = FuzzyReleaseMatcher(es=es_client)
+ Case = collections.namedtuple("Case", "title date input jaccard_index_threshold expected")
+ cases = yaml_to_cases(
+ Case, "tests/files/fuzzy_release_match_release_exact_title_partial_contrib/*.yaml")
+ for i, c in enumerate(cases):
+ entity = entity_from_json(c.input, ReleaseEntity)
+ result = matcher.match_release_exact_title_partial_contrib(entity)
+ assert len(result) == c.expected, "[{}] {}".format(c.title, c.input)