diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-11-17 14:51:50 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-12-06 19:53:30 +0100 |
commit | dd6149140542585f2b0bfc3b334ec2b0a88b790e (patch) | |
tree | 6a11c228558cfbf73932bc828cda9be3735cfd78 /tests | |
parent | d104f8d0ba8eef5563555de82be66bbf17f961db (diff) | |
download | fuzzycat-dd6149140542585f2b0bfc3b334ec2b0a88b790e.tar.gz fuzzycat-dd6149140542585f2b0bfc3b334ec2b0a88b790e.zip |
complete FuzzyReleaseMatcher refactoring
We keep the name, since the API - "matcher.match(release)" - is the
same; simplified queries; at most one query is performed against
Elasticsearch; parallel release retrieval from the API; optional support
for release year windows.
Test cases are expressed in YAML and will be auto-loaded from the
specified directory; tests run against the current search endpoint,
which means the actual output may change on index updates; for the
moment, we think this setup is relatively simple and not too unstable.
about: title contrib, partial name
input: >
{
"contribs": [
{
"raw_name": "Adams"
}
],
"title": "digital libraries",
"ext_ids": {}
}
release_year_padding: 1
expected:
- 7rmvqtrb2jdyhcxxodihzzcugy
- a2u6ougtsjcbvczou6sazsulcm
- dy45vilej5diros6zmax46nm4e
- exuwhhayird4fdjmmsiqpponlq
- gqrj7jikezgcfpjfazhpf4e7c4
- mkmqt3453relbpuyktnmsg6hjq
- t2g5sl3dgzchtnq7dynxyzje44
- t4tvenhrvzamraxrvvxivxmvga
- wd3oeoi3bffknfbg2ymleqc4ja
- y63a6dhrfnb7bltlxfynydbojy
Diffstat (limited to 'tests')
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/0.yaml | 16 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/1.yaml | 24 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/2.yaml | 24 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/3.yaml | 19 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/4.yaml | 16 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/5.yaml | 16 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/6.yaml | 24 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/7.yaml | 10 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/8.yaml | 139 | ||||
-rw-r--r-- | tests/files/simple_fuzzy_release_matcher/9.yaml | 139 | ||||
-rw-r--r-- | tests/test_matching.py | 94 |
11 files changed, 437 insertions, 84 deletions
diff --git a/tests/files/simple_fuzzy_release_matcher/0.yaml b/tests/files/simple_fuzzy_release_matcher/0.yaml new file mode 100644 index 0000000..71fc992 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/0.yaml @@ -0,0 +1,16 @@ +about: title and contrib +input: > + { + "contribs": [ + { + "raw_name": "Michael Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 7rmvqtrb2jdyhcxxodihzzcugy + - upm5nljirrbsfenoyxsisciltq + - wd3oeoi3bffknfbg2ymleqc4ja diff --git a/tests/files/simple_fuzzy_release_matcher/1.yaml b/tests/files/simple_fuzzy_release_matcher/1.yaml new file mode 100644 index 0000000..df6a954 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/1.yaml @@ -0,0 +1,24 @@ +about: title contrib, partial name +input: > + { + "contribs": [ + { + "raw_name": "Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 7rmvqtrb2jdyhcxxodihzzcugy + - a2u6ougtsjcbvczou6sazsulcm + - dy45vilej5diros6zmax46nm4e + - exuwhhayird4fdjmmsiqpponlq + - gqrj7jikezgcfpjfazhpf4e7c4 + - mkmqt3453relbpuyktnmsg6hjq + - t2g5sl3dgzchtnq7dynxyzje44 + - t4tvenhrvzamraxrvvxivxmvga + - wd3oeoi3bffknfbg2ymleqc4ja + - y63a6dhrfnb7bltlxfynydbojy + diff --git a/tests/files/simple_fuzzy_release_matcher/2.yaml b/tests/files/simple_fuzzy_release_matcher/2.yaml new file mode 100644 index 0000000..df6a954 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/2.yaml @@ -0,0 +1,24 @@ +about: title contrib, partial name +input: > + { + "contribs": [ + { + "raw_name": "Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 7rmvqtrb2jdyhcxxodihzzcugy + - a2u6ougtsjcbvczou6sazsulcm + - dy45vilej5diros6zmax46nm4e + - exuwhhayird4fdjmmsiqpponlq + - gqrj7jikezgcfpjfazhpf4e7c4 + - mkmqt3453relbpuyktnmsg6hjq + - t2g5sl3dgzchtnq7dynxyzje44 + - t4tvenhrvzamraxrvvxivxmvga + - wd3oeoi3bffknfbg2ymleqc4ja + - y63a6dhrfnb7bltlxfynydbojy 
+ diff --git a/tests/files/simple_fuzzy_release_matcher/3.yaml b/tests/files/simple_fuzzy_release_matcher/3.yaml new file mode 100644 index 0000000..1ab761b --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/3.yaml @@ -0,0 +1,19 @@ +about: title only +input: > + { + "title": "The future of scholarly communications", + "ext_ids": {} + } +release_year_padding: 0 +expected: + - '2f57funqizf4lcxjanls45upom' + - '3p2hngx6kfa33bdaobipimdzhe' + - '75dzcdywlbb3logmrrpkabanfa' + - 'ccoocm7uzjgwnlpfk5fbwfudjm' + - 'nfydgfziuvhete6p3lrn4u325u' + - 'ntpiporu75bendibjku4kjmd5q' + - 'op6a5fclonhrxm3zlo6ub2tlw4' + - 'opoxzl3zzbccdh5tptm5p2krem' + - 'umzryrtocbakberuubjm2pgxum' + - 'zb4bjnwqsveyzcwebvvmnsoq7u' + diff --git a/tests/files/simple_fuzzy_release_matcher/4.yaml b/tests/files/simple_fuzzy_release_matcher/4.yaml new file mode 100644 index 0000000..9419406 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/4.yaml @@ -0,0 +1,16 @@ +about: title, year +input: > + { + "title": "The future of scholarly communications", + "release_year": 2014, + "ext_ids": {} + } +release_year_padding: 0 +expected: + - '66r4s55dpvht5jghwkhupai2km' + - 'ccoocm7uzjgwnlpfk5fbwfudjm' + - 'du4awowpsbbcjlo2pe6dvmxewu' + - 'nfydgfziuvhete6p3lrn4u325u' + - 'ntpiporu75bendibjku4kjmd5q' + - 'op6a5fclonhrxm3zlo6ub2tlw4' + - 'xsuxmk5dyba6rnkeslipxxdlzi' diff --git a/tests/files/simple_fuzzy_release_matcher/5.yaml b/tests/files/simple_fuzzy_release_matcher/5.yaml new file mode 100644 index 0000000..1eb435b --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/5.yaml @@ -0,0 +1,16 @@ +about: contrib, year +input: > + { + "contribs": [ + { + "raw_name": "Lissandrini" + } + ], + "release_year": 2014, + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 'xfhjsixnlvbibigrilisqqvfk4' + - 'zfhfpo2shrdexpgd2as4fz7wnm' + - 'cyct2bqs5feqbowg6ovv53pdfq' diff --git a/tests/files/simple_fuzzy_release_matcher/6.yaml b/tests/files/simple_fuzzy_release_matcher/6.yaml new file mode 100644 
index 0000000..ae52b23 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/6.yaml @@ -0,0 +1,24 @@ +about: contrib, year +input: > + { + "contribs": [ + { + "raw_name": "Goodwin" + } + ], + "release_year": 2014, + "ext_ids": {} + } +release_year_padding: 0 +expected: + - 2bbtr4cltbgannqc6vqijvvzdq + - 34i2hba6tzf3xomobhumfkkvga + - 62sz5fhhuvenpfctf6wejl5m2i + - chnqmdm4yfd4zk6kawujvsbhwy + - chs7be23vfdthk3xre54w534zm + - f5lp3nipazhyxoa2xarlomkofm + - hikujb5wmvasnoat2myt56l63y + - qbom7rwqtzfypa5hltgbx4e2iq + - qh44drz3bvg2ndzwzc55xops7y + - r4n57quetbf7tddwodjauegmzq + diff --git a/tests/files/simple_fuzzy_release_matcher/7.yaml b/tests/files/simple_fuzzy_release_matcher/7.yaml new file mode 100644 index 0000000..2330f0d --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/7.yaml @@ -0,0 +1,10 @@ +about: just a subtitle +input: > + { + "subtitle": "topographies parisiennes", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - yvqtz2zvkzcbpj4jxrp7bvydfu + - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/files/simple_fuzzy_release_matcher/8.yaml b/tests/files/simple_fuzzy_release_matcher/8.yaml new file mode 100644 index 0000000..b43e53a --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/8.yaml @@ -0,0 +1,139 @@ +about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu +input: > + { + "abstracts": [], + "refs": [], + "contribs": [ + { + "index": 0, + "raw_name": "Annelies Schulte Nordholt", + "role": "author", + "extra": { + "seq": "first" + } + } + ], + "publisher": "Uopen Journals", + "pages": "66", + "ext_ids": { + "doi": "10.18352/bmgn-lchr.128" + }, + "release_year": 2008, + "release_date": "2008-02-19", + "release_stage": "published", + "release_type": "article-journal", + "container_id": "sm7svbj64vc55gj4p23t7c3lrm", + "webcaptures": [], + "filesets": [], + "files": [ + { + "release_ids": [ + "yvqtz2zvkzcbpj4jxrp7bvydfu" + ], + "mimetype": "application/pdf", + "urls": [ + { + "url": 
"https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "publisher" + }, + { + "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "webarchive" + } + ], + "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23", + "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40", + "md5": "7dae3ec6c1d65cae6a91554071cc9625", + "size": 889420, + "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75", + "ident": "oew6z4a6gvfqxc5kiy2r62ucfq", + "state": "active" + } + ], + "container": { + "wikidata_qid": "Q15763709", + "issnp": "1873-5045", + "issne": "1873-5045", + "issnl": "1873-5045", + "publisher": "Uopen Journals", + "name": "Relief: Revue Électronique de Littérature Francaise", + "extra": { + "country": "nl", + "default_license": "CC-BY", + "doaj": { + "as_of": "2021-11-20", + "default_license": "CC-BY", + "seal": false + }, + "kbart": { + "clockss": { + "year_spans": [ + [ + 2007, + 2016 + ] + ] + }, + "lockss": { + "year_spans": [ + [ + 2007, + 2019 + ] + ] + }, + "pkp_pln": { + "year_spans": [ + [ + 2007, + 2021 + ] + ] + }, + "portico": { + "year_spans": [ + [ + 2007, + 2017 + ] + ] + } + }, + "languages": [ + "en" + ], + "publisher_type": "unipress", + "road": { + "as_of": "2018-01-24" + }, + "sherpa_romeo": { + "color": "blue" + }, + "szczepanski": { + "as_of": "2018" + }, + "urls": [ + "https://www.revue-relief.org/", + "http://www.revue-relief.org/index.php/relief", + "http://www.revue-relief.org/index.php/relief/about" + ] + }, + "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e", + "ident": "sm7svbj64vc55gj4p23t7c3lrm", + "state": "active" + }, + "work_id": "qcpd2i2txfdi5emqb7fxsawk6e", + "title": "Georges Perec: topographies parisiennes du flâneur", + "state": "active", + "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu", + "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff", + "extra": { + "crossref": { + "type": "journal-article" + } + } 
+ } +release_year_padding: 1 +expected: + - yvqtz2zvkzcbpj4jxrp7bvydfu + - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/files/simple_fuzzy_release_matcher/9.yaml b/tests/files/simple_fuzzy_release_matcher/9.yaml new file mode 100644 index 0000000..b43e53a --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/9.yaml @@ -0,0 +1,139 @@ +about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu +input: > + { + "abstracts": [], + "refs": [], + "contribs": [ + { + "index": 0, + "raw_name": "Annelies Schulte Nordholt", + "role": "author", + "extra": { + "seq": "first" + } + } + ], + "publisher": "Uopen Journals", + "pages": "66", + "ext_ids": { + "doi": "10.18352/bmgn-lchr.128" + }, + "release_year": 2008, + "release_date": "2008-02-19", + "release_stage": "published", + "release_type": "article-journal", + "container_id": "sm7svbj64vc55gj4p23t7c3lrm", + "webcaptures": [], + "filesets": [], + "files": [ + { + "release_ids": [ + "yvqtz2zvkzcbpj4jxrp7bvydfu" + ], + "mimetype": "application/pdf", + "urls": [ + { + "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "publisher" + }, + { + "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "webarchive" + } + ], + "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23", + "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40", + "md5": "7dae3ec6c1d65cae6a91554071cc9625", + "size": 889420, + "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75", + "ident": "oew6z4a6gvfqxc5kiy2r62ucfq", + "state": "active" + } + ], + "container": { + "wikidata_qid": "Q15763709", + "issnp": "1873-5045", + "issne": "1873-5045", + "issnl": "1873-5045", + "publisher": "Uopen Journals", + "name": "Relief: Revue Électronique de Littérature Francaise", + "extra": { + "country": "nl", + "default_license": "CC-BY", + "doaj": { + "as_of": "2021-11-20", + "default_license": 
"CC-BY", + "seal": false + }, + "kbart": { + "clockss": { + "year_spans": [ + [ + 2007, + 2016 + ] + ] + }, + "lockss": { + "year_spans": [ + [ + 2007, + 2019 + ] + ] + }, + "pkp_pln": { + "year_spans": [ + [ + 2007, + 2021 + ] + ] + }, + "portico": { + "year_spans": [ + [ + 2007, + 2017 + ] + ] + } + }, + "languages": [ + "en" + ], + "publisher_type": "unipress", + "road": { + "as_of": "2018-01-24" + }, + "sherpa_romeo": { + "color": "blue" + }, + "szczepanski": { + "as_of": "2018" + }, + "urls": [ + "https://www.revue-relief.org/", + "http://www.revue-relief.org/index.php/relief", + "http://www.revue-relief.org/index.php/relief/about" + ] + }, + "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e", + "ident": "sm7svbj64vc55gj4p23t7c3lrm", + "state": "active" + }, + "work_id": "qcpd2i2txfdi5emqb7fxsawk6e", + "title": "Georges Perec: topographies parisiennes du flâneur", + "state": "active", + "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu", + "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff", + "extra": { + "crossref": { + "type": "journal-article" + } + } + } +release_year_padding: 1 +expected: + - yvqtz2zvkzcbpj4jxrp7bvydfu + - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/test_matching.py b/tests/test_matching.py index a7754ee..b9d7fae 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -48,90 +48,16 @@ def es_client(): return elasticsearch.Elasticsearch([FATCAT_SEARCH_URL]) -def test_matcher_match_release(es_client, caplog): - cases = ( - ("wtv64ahbdzgwnan7rllwr3nurm", 1), - ("eqcgtpav3na5jh56o5vjsvb4ei", 1), - ) - matcher = FuzzyReleaseMatcher(es=es_client, size=5) - for i, (ident, count) in enumerate(cases): - entity = anything_to_entity(ident, ReleaseEntity) - result = matcher.match(entity) - logger.info("[{}] given {}, found {}".format(i, entity.title, len(result))) - assert len(result) == count - - # Partial data. 
- cases = ( - ({ - "title": "digital libraries", - "ext_ids": {} - }, 5), - ({ - "title": "unlikelytitle", - "ext_ids": {} - }, 0), - ({ - "title": "Imminent dystopia", - "ext_ids": {} - }, 5), - ({ - "title": "", - "contribs": [{ - "raw_name": "Aristoteles" - }], - "ext_ids": {} - }, 5), - # ({ - # "title": "Letter", - # "contribs": [{"raw_name": "Claudel"}], - # "ext_ids": {} - # }, 1), - # ({ - # "title": "The Future of Digital Scholarship", - # "contribs": [{ - # "raw_name": "Costantino Thanos" - # }], - # "ext_ids": {} - # }, 5), - ) - for i, (doc, count) in enumerate(cases): - entity = entity_from_dict(doc, ReleaseEntity) - result = matcher.match(entity) - with caplog.at_level(logging.INFO): - logging.info("[{}] given title '{}', found {}, {}".format(i, entity.title, len(result), - [v.title for v in result])) - assert len(result) == count, doc - - -def test_fuzzy_release_matcher_match_release_by_id(es_client, caplog): - matcher = FuzzyReleaseMatcher(es=es_client) - cases = ( - ("wtv64ahbdzgwnan7rllwr3nurm", 1), - ("eqcgtpav3na5jh56o5vjsvb4ei", 1), - ) - for i, (ident, count) in enumerate(cases): - entity = anything_to_entity(ident, ReleaseEntity) - result = matcher.match_release_by_id(entity) - assert len(result) == count - - -def test_fuzzy_release_match_release_exact_title_exact_contrib(es_client, caplog): - matcher = FuzzyReleaseMatcher(es=es_client) - Case = collections.namedtuple("Case", "title date input expected") - cases = yaml_to_cases( - Case, "tests/files/fuzzy_release_match_release_exact_title_exact_contrib/*.yaml") - for i, c in enumerate(cases): - entity = entity_from_json(c.input, ReleaseEntity) - result = matcher.match_release_exact_title_exact_contrib(entity) - assert len(result) == c.expected, "[{}] {}".format(c.title, c.input) - - -def test_fuzzy_release_match_release_exact_title_partial_contrib(es_client, caplog): +def test_simple_fuzzy_release_matcher(es_client, caplog): + """ + Use a single test function to test the higher level match 
function. We want + the result to be sensible, but should also document broken examples here. + """ matcher = FuzzyReleaseMatcher(es=es_client) - Case = collections.namedtuple("Case", "title date input jaccard_index_threshold expected") - cases = yaml_to_cases( - Case, "tests/files/fuzzy_release_match_release_exact_title_partial_contrib/*.yaml") + Case = collections.namedtuple("Case", "about input release_year_padding expected") + cases = yaml_to_cases(Case, "tests/files/simple_fuzzy_release_matcher/*.yaml") for i, c in enumerate(cases): + matcher.release_year_padding = c.release_year_padding entity = entity_from_json(c.input, ReleaseEntity) - result = matcher.match_release_exact_title_partial_contrib(entity) - assert len(result) == c.expected, "[{}] {}".format(c.title, c.input) + result = matcher.match(entity) + assert set([r.ident for r in result]) == set(c.expected), "[{}] {}".format(c.about, c.input) |