aboutsummaryrefslogtreecommitdiffstats
path: root/tests
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-11-17 14:51:50 +0100
committerMartin Czygan <martin.czygan@gmail.com>2021-12-06 19:53:30 +0100
commitdd6149140542585f2b0bfc3b334ec2b0a88b790e (patch)
tree6a11c228558cfbf73932bc828cda9be3735cfd78 /tests
parentd104f8d0ba8eef5563555de82be66bbf17f961db (diff)
downloadfuzzycat-dd6149140542585f2b0bfc3b334ec2b0a88b790e.tar.gz
fuzzycat-dd6149140542585f2b0bfc3b334ec2b0a88b790e.zip
complete FuzzyReleaseMatcher refactoring
We keep the name, since the API - "matcher.match(release)" - is the same; simplified queries; at most one query is performed against elasticsearch; parallel release retrieval from the API; optional support for release year windows.

Test cases are expressed in YAML and will be auto-loaded from the specified directory; the tests run against the current search endpoint, which means the actual output may change on index updates; for the moment, we think this setup is relatively simple and not too unstable.

Example test case:

    about: title contrib, partial name
    input: >
      {
        "contribs": [
          {
            "raw_name": "Adams"
          }
        ],
        "title": "digital libraries",
        "ext_ids": {}
      }
    release_year_padding: 1
    expected:
      - 7rmvqtrb2jdyhcxxodihzzcugy
      - a2u6ougtsjcbvczou6sazsulcm
      - dy45vilej5diros6zmax46nm4e
      - exuwhhayird4fdjmmsiqpponlq
      - gqrj7jikezgcfpjfazhpf4e7c4
      - mkmqt3453relbpuyktnmsg6hjq
      - t2g5sl3dgzchtnq7dynxyzje44
      - t4tvenhrvzamraxrvvxivxmvga
      - wd3oeoi3bffknfbg2ymleqc4ja
      - y63a6dhrfnb7bltlxfynydbojy
Diffstat (limited to 'tests')
-rw-r--r--tests/files/simple_fuzzy_release_matcher/0.yaml16
-rw-r--r--tests/files/simple_fuzzy_release_matcher/1.yaml24
-rw-r--r--tests/files/simple_fuzzy_release_matcher/2.yaml24
-rw-r--r--tests/files/simple_fuzzy_release_matcher/3.yaml19
-rw-r--r--tests/files/simple_fuzzy_release_matcher/4.yaml16
-rw-r--r--tests/files/simple_fuzzy_release_matcher/5.yaml16
-rw-r--r--tests/files/simple_fuzzy_release_matcher/6.yaml24
-rw-r--r--tests/files/simple_fuzzy_release_matcher/7.yaml10
-rw-r--r--tests/files/simple_fuzzy_release_matcher/8.yaml139
-rw-r--r--tests/files/simple_fuzzy_release_matcher/9.yaml139
-rw-r--r--tests/test_matching.py94
11 files changed, 437 insertions, 84 deletions
diff --git a/tests/files/simple_fuzzy_release_matcher/0.yaml b/tests/files/simple_fuzzy_release_matcher/0.yaml
new file mode 100644
index 0000000..71fc992
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/0.yaml
@@ -0,0 +1,16 @@
+about: title and contrib
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Michael Adams"
+ }
+ ],
+ "title": "digital libraries",
+ "ext_ids": {}
+ }
+release_year_padding: 1
+expected:
+ - 7rmvqtrb2jdyhcxxodihzzcugy
+ - upm5nljirrbsfenoyxsisciltq
+ - wd3oeoi3bffknfbg2ymleqc4ja
diff --git a/tests/files/simple_fuzzy_release_matcher/1.yaml b/tests/files/simple_fuzzy_release_matcher/1.yaml
new file mode 100644
index 0000000..df6a954
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/1.yaml
@@ -0,0 +1,24 @@
+about: title contrib, partial name
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Adams"
+ }
+ ],
+ "title": "digital libraries",
+ "ext_ids": {}
+ }
+release_year_padding: 1
+expected:
+ - 7rmvqtrb2jdyhcxxodihzzcugy
+ - a2u6ougtsjcbvczou6sazsulcm
+ - dy45vilej5diros6zmax46nm4e
+ - exuwhhayird4fdjmmsiqpponlq
+ - gqrj7jikezgcfpjfazhpf4e7c4
+ - mkmqt3453relbpuyktnmsg6hjq
+ - t2g5sl3dgzchtnq7dynxyzje44
+ - t4tvenhrvzamraxrvvxivxmvga
+ - wd3oeoi3bffknfbg2ymleqc4ja
+ - y63a6dhrfnb7bltlxfynydbojy
+
diff --git a/tests/files/simple_fuzzy_release_matcher/2.yaml b/tests/files/simple_fuzzy_release_matcher/2.yaml
new file mode 100644
index 0000000..df6a954
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/2.yaml
@@ -0,0 +1,24 @@
+about: title contrib, partial name
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Adams"
+ }
+ ],
+ "title": "digital libraries",
+ "ext_ids": {}
+ }
+release_year_padding: 1
+expected:
+ - 7rmvqtrb2jdyhcxxodihzzcugy
+ - a2u6ougtsjcbvczou6sazsulcm
+ - dy45vilej5diros6zmax46nm4e
+ - exuwhhayird4fdjmmsiqpponlq
+ - gqrj7jikezgcfpjfazhpf4e7c4
+ - mkmqt3453relbpuyktnmsg6hjq
+ - t2g5sl3dgzchtnq7dynxyzje44
+ - t4tvenhrvzamraxrvvxivxmvga
+ - wd3oeoi3bffknfbg2ymleqc4ja
+ - y63a6dhrfnb7bltlxfynydbojy
+
diff --git a/tests/files/simple_fuzzy_release_matcher/3.yaml b/tests/files/simple_fuzzy_release_matcher/3.yaml
new file mode 100644
index 0000000..1ab761b
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/3.yaml
@@ -0,0 +1,19 @@
+about: title only
+input: >
+ {
+ "title": "The future of scholarly communications",
+ "ext_ids": {}
+ }
+release_year_padding: 0
+expected:
+ - '2f57funqizf4lcxjanls45upom'
+ - '3p2hngx6kfa33bdaobipimdzhe'
+ - '75dzcdywlbb3logmrrpkabanfa'
+ - 'ccoocm7uzjgwnlpfk5fbwfudjm'
+ - 'nfydgfziuvhete6p3lrn4u325u'
+ - 'ntpiporu75bendibjku4kjmd5q'
+ - 'op6a5fclonhrxm3zlo6ub2tlw4'
+ - 'opoxzl3zzbccdh5tptm5p2krem'
+ - 'umzryrtocbakberuubjm2pgxum'
+ - 'zb4bjnwqsveyzcwebvvmnsoq7u'
+
diff --git a/tests/files/simple_fuzzy_release_matcher/4.yaml b/tests/files/simple_fuzzy_release_matcher/4.yaml
new file mode 100644
index 0000000..9419406
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/4.yaml
@@ -0,0 +1,16 @@
+about: title, year
+input: >
+ {
+ "title": "The future of scholarly communications",
+ "release_year": 2014,
+ "ext_ids": {}
+ }
+release_year_padding: 0
+expected:
+ - '66r4s55dpvht5jghwkhupai2km'
+ - 'ccoocm7uzjgwnlpfk5fbwfudjm'
+ - 'du4awowpsbbcjlo2pe6dvmxewu'
+ - 'nfydgfziuvhete6p3lrn4u325u'
+ - 'ntpiporu75bendibjku4kjmd5q'
+ - 'op6a5fclonhrxm3zlo6ub2tlw4'
+ - 'xsuxmk5dyba6rnkeslipxxdlzi'
diff --git a/tests/files/simple_fuzzy_release_matcher/5.yaml b/tests/files/simple_fuzzy_release_matcher/5.yaml
new file mode 100644
index 0000000..1eb435b
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/5.yaml
@@ -0,0 +1,16 @@
+about: contrib, year
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Lissandrini"
+ }
+ ],
+ "release_year": 2014,
+ "ext_ids": {}
+ }
+release_year_padding: 1
+expected:
+ - 'xfhjsixnlvbibigrilisqqvfk4'
+ - 'zfhfpo2shrdexpgd2as4fz7wnm'
+ - 'cyct2bqs5feqbowg6ovv53pdfq'
diff --git a/tests/files/simple_fuzzy_release_matcher/6.yaml b/tests/files/simple_fuzzy_release_matcher/6.yaml
new file mode 100644
index 0000000..ae52b23
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/6.yaml
@@ -0,0 +1,24 @@
+about: contrib, year
+input: >
+ {
+ "contribs": [
+ {
+ "raw_name": "Goodwin"
+ }
+ ],
+ "release_year": 2014,
+ "ext_ids": {}
+ }
+release_year_padding: 0
+expected:
+ - 2bbtr4cltbgannqc6vqijvvzdq
+ - 34i2hba6tzf3xomobhumfkkvga
+ - 62sz5fhhuvenpfctf6wejl5m2i
+ - chnqmdm4yfd4zk6kawujvsbhwy
+ - chs7be23vfdthk3xre54w534zm
+ - f5lp3nipazhyxoa2xarlomkofm
+ - hikujb5wmvasnoat2myt56l63y
+ - qbom7rwqtzfypa5hltgbx4e2iq
+ - qh44drz3bvg2ndzwzc55xops7y
+ - r4n57quetbf7tddwodjauegmzq
+
diff --git a/tests/files/simple_fuzzy_release_matcher/7.yaml b/tests/files/simple_fuzzy_release_matcher/7.yaml
new file mode 100644
index 0000000..2330f0d
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/7.yaml
@@ -0,0 +1,10 @@
+about: just a subtitle
+input: >
+ {
+ "subtitle": "topographies parisiennes",
+ "ext_ids": {}
+ }
+release_year_padding: 1
+expected:
+ - yvqtz2zvkzcbpj4jxrp7bvydfu
+ - lttg27o7mjganpkhrgy3xyv7vu
diff --git a/tests/files/simple_fuzzy_release_matcher/8.yaml b/tests/files/simple_fuzzy_release_matcher/8.yaml
new file mode 100644
index 0000000..b43e53a
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/8.yaml
@@ -0,0 +1,139 @@
+about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu
+input: >
+ {
+ "abstracts": [],
+ "refs": [],
+ "contribs": [
+ {
+ "index": 0,
+ "raw_name": "Annelies Schulte Nordholt",
+ "role": "author",
+ "extra": {
+ "seq": "first"
+ }
+ }
+ ],
+ "publisher": "Uopen Journals",
+ "pages": "66",
+ "ext_ids": {
+ "doi": "10.18352/bmgn-lchr.128"
+ },
+ "release_year": 2008,
+ "release_date": "2008-02-19",
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "container_id": "sm7svbj64vc55gj4p23t7c3lrm",
+ "webcaptures": [],
+ "filesets": [],
+ "files": [
+ {
+ "release_ids": [
+ "yvqtz2zvkzcbpj4jxrp7bvydfu"
+ ],
+ "mimetype": "application/pdf",
+ "urls": [
+ {
+ "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/",
+ "rel": "publisher"
+ },
+ {
+ "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/",
+ "rel": "webarchive"
+ }
+ ],
+ "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23",
+ "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40",
+ "md5": "7dae3ec6c1d65cae6a91554071cc9625",
+ "size": 889420,
+ "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75",
+ "ident": "oew6z4a6gvfqxc5kiy2r62ucfq",
+ "state": "active"
+ }
+ ],
+ "container": {
+ "wikidata_qid": "Q15763709",
+ "issnp": "1873-5045",
+ "issne": "1873-5045",
+ "issnl": "1873-5045",
+ "publisher": "Uopen Journals",
+ "name": "Relief: Revue Électronique de Littérature Francaise",
+ "extra": {
+ "country": "nl",
+ "default_license": "CC-BY",
+ "doaj": {
+ "as_of": "2021-11-20",
+ "default_license": "CC-BY",
+ "seal": false
+ },
+ "kbart": {
+ "clockss": {
+ "year_spans": [
+ [
+ 2007,
+ 2016
+ ]
+ ]
+ },
+ "lockss": {
+ "year_spans": [
+ [
+ 2007,
+ 2019
+ ]
+ ]
+ },
+ "pkp_pln": {
+ "year_spans": [
+ [
+ 2007,
+ 2021
+ ]
+ ]
+ },
+ "portico": {
+ "year_spans": [
+ [
+ 2007,
+ 2017
+ ]
+ ]
+ }
+ },
+ "languages": [
+ "en"
+ ],
+ "publisher_type": "unipress",
+ "road": {
+ "as_of": "2018-01-24"
+ },
+ "sherpa_romeo": {
+ "color": "blue"
+ },
+ "szczepanski": {
+ "as_of": "2018"
+ },
+ "urls": [
+ "https://www.revue-relief.org/",
+ "http://www.revue-relief.org/index.php/relief",
+ "http://www.revue-relief.org/index.php/relief/about"
+ ]
+ },
+ "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e",
+ "ident": "sm7svbj64vc55gj4p23t7c3lrm",
+ "state": "active"
+ },
+ "work_id": "qcpd2i2txfdi5emqb7fxsawk6e",
+ "title": "Georges Perec: topographies parisiennes du flâneur",
+ "state": "active",
+ "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu",
+ "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff",
+ "extra": {
+ "crossref": {
+ "type": "journal-article"
+ }
+ }
+ }
+release_year_padding: 1
+expected:
+ - yvqtz2zvkzcbpj4jxrp7bvydfu
+ - lttg27o7mjganpkhrgy3xyv7vu
diff --git a/tests/files/simple_fuzzy_release_matcher/9.yaml b/tests/files/simple_fuzzy_release_matcher/9.yaml
new file mode 100644
index 0000000..b43e53a
--- /dev/null
+++ b/tests/files/simple_fuzzy_release_matcher/9.yaml
@@ -0,0 +1,139 @@
+about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu
+input: >
+ {
+ "abstracts": [],
+ "refs": [],
+ "contribs": [
+ {
+ "index": 0,
+ "raw_name": "Annelies Schulte Nordholt",
+ "role": "author",
+ "extra": {
+ "seq": "first"
+ }
+ }
+ ],
+ "publisher": "Uopen Journals",
+ "pages": "66",
+ "ext_ids": {
+ "doi": "10.18352/bmgn-lchr.128"
+ },
+ "release_year": 2008,
+ "release_date": "2008-02-19",
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "container_id": "sm7svbj64vc55gj4p23t7c3lrm",
+ "webcaptures": [],
+ "filesets": [],
+ "files": [
+ {
+ "release_ids": [
+ "yvqtz2zvkzcbpj4jxrp7bvydfu"
+ ],
+ "mimetype": "application/pdf",
+ "urls": [
+ {
+ "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/",
+ "rel": "publisher"
+ },
+ {
+ "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/",
+ "rel": "webarchive"
+ }
+ ],
+ "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23",
+ "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40",
+ "md5": "7dae3ec6c1d65cae6a91554071cc9625",
+ "size": 889420,
+ "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75",
+ "ident": "oew6z4a6gvfqxc5kiy2r62ucfq",
+ "state": "active"
+ }
+ ],
+ "container": {
+ "wikidata_qid": "Q15763709",
+ "issnp": "1873-5045",
+ "issne": "1873-5045",
+ "issnl": "1873-5045",
+ "publisher": "Uopen Journals",
+ "name": "Relief: Revue Électronique de Littérature Francaise",
+ "extra": {
+ "country": "nl",
+ "default_license": "CC-BY",
+ "doaj": {
+ "as_of": "2021-11-20",
+ "default_license": "CC-BY",
+ "seal": false
+ },
+ "kbart": {
+ "clockss": {
+ "year_spans": [
+ [
+ 2007,
+ 2016
+ ]
+ ]
+ },
+ "lockss": {
+ "year_spans": [
+ [
+ 2007,
+ 2019
+ ]
+ ]
+ },
+ "pkp_pln": {
+ "year_spans": [
+ [
+ 2007,
+ 2021
+ ]
+ ]
+ },
+ "portico": {
+ "year_spans": [
+ [
+ 2007,
+ 2017
+ ]
+ ]
+ }
+ },
+ "languages": [
+ "en"
+ ],
+ "publisher_type": "unipress",
+ "road": {
+ "as_of": "2018-01-24"
+ },
+ "sherpa_romeo": {
+ "color": "blue"
+ },
+ "szczepanski": {
+ "as_of": "2018"
+ },
+ "urls": [
+ "https://www.revue-relief.org/",
+ "http://www.revue-relief.org/index.php/relief",
+ "http://www.revue-relief.org/index.php/relief/about"
+ ]
+ },
+ "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e",
+ "ident": "sm7svbj64vc55gj4p23t7c3lrm",
+ "state": "active"
+ },
+ "work_id": "qcpd2i2txfdi5emqb7fxsawk6e",
+ "title": "Georges Perec: topographies parisiennes du flâneur",
+ "state": "active",
+ "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu",
+ "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff",
+ "extra": {
+ "crossref": {
+ "type": "journal-article"
+ }
+ }
+ }
+release_year_padding: 1
+expected:
+ - yvqtz2zvkzcbpj4jxrp7bvydfu
+ - lttg27o7mjganpkhrgy3xyv7vu
diff --git a/tests/test_matching.py b/tests/test_matching.py
index a7754ee..b9d7fae 100644
--- a/tests/test_matching.py
+++ b/tests/test_matching.py
@@ -48,90 +48,16 @@ def es_client():
return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL])
-def test_matcher_match_release(es_client, caplog):
- cases = (
- ("wtv64ahbdzgwnan7rllwr3nurm", 1),
- ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
- )
- matcher = FuzzyReleaseMatcher(es=es_client, size=5)
- for i, (ident, count) in enumerate(cases):
- entity = anything_to_entity(ident, ReleaseEntity)
- result = matcher.match(entity)
- logger.info("[{}] given {}, found {}".format(i, entity.title, len(result)))
- assert len(result) == count
-
- # Partial data.
- cases = (
- ({
- "title": "digital libraries",
- "ext_ids": {}
- }, 5),
- ({
- "title": "unlikelytitle",
- "ext_ids": {}
- }, 0),
- ({
- "title": "Imminent dystopia",
- "ext_ids": {}
- }, 5),
- ({
- "title": "",
- "contribs": [{
- "raw_name": "Aristoteles"
- }],
- "ext_ids": {}
- }, 5),
- # ({
- # "title": "Letter",
- # "contribs": [{"raw_name": "Claudel"}],
- # "ext_ids": {}
- # }, 1),
- # ({
- # "title": "The Future of Digital Scholarship",
- # "contribs": [{
- # "raw_name": "Costantino Thanos"
- # }],
- # "ext_ids": {}
- # }, 5),
- )
- for i, (doc, count) in enumerate(cases):
- entity = entity_from_dict(doc, ReleaseEntity)
- result = matcher.match(entity)
- with caplog.at_level(logging.INFO):
- logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result),
- [v.title for v in result]))
- assert len(result) == count, doc
-
-
-def test_fuzzy_release_matcher_match_release_by_id(es_client, caplog):
- matcher = FuzzyReleaseMatcher(es=es_client)
- cases = (
- ("wtv64ahbdzgwnan7rllwr3nurm", 1),
- ("eqcgtpav3na5jh56o5vjsvb4ei", 1),
- )
- for i, (ident, count) in enumerate(cases):
- entity = anything_to_entity(ident, ReleaseEntity)
- result = matcher.match_release_by_id(entity)
- assert len(result) == count
-
-
-def test_fuzzy_release_match_release_exact_title_exact_contrib(es_client, caplog):
- matcher = FuzzyReleaseMatcher(es=es_client)
- Case = collections.namedtuple("Case", "title date input expected")
- cases = yaml_to_cases(
- Case, "tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml")
- for i, c in enumerate(cases):
- entity = entity_from_json(c.input, ReleaseEntity)
- result = matcher.match_release_exact_title_exact_contrib(entity)
- assert len(result) == c.expected, "[{}] {}".format(c.title, c.input)
-
-
-def test_fuzzy_release_match_release_exact_title_partial_contrib(es_client, caplog):
+def test_simple_fuzzy_release_matcher(es_client, caplog):
+ """
+ Use a single test function to test the higher level match function. We want
+ the result to be sensible, but should also document broken examples here.
+ """
matcher = FuzzyReleaseMatcher(es=es_client)
- Case = collections.namedtuple("Case", "title date input jaccard_index_threshold expected")
- cases = yaml_to_cases(
- Case, "tests/files/fuzzy_release_match_release_exact_title_partial_contrib/*.yaml")
+ Case = collections.namedtuple("Case", "about input release_year_padding expected")
+ cases = yaml_to_cases(Case, "tests/files/simple_fuzzy_release_matcher/*.yaml")
for i, c in enumerate(cases):
+ matcher.release_year_padding = c.release_year_padding
entity = entity_from_json(c.input, ReleaseEntity)
- result = matcher.match_release_exact_title_partial_contrib(entity)
- assert len(result) == c.expected, "[{}] {}".format(c.title, c.input)
+ result = matcher.match(entity)
+ assert set([r.ident for r in result]) == set(c.expected), "[{}] {}".format(c.about, c.input)