From dd6149140542585f2b0bfc3b334ec2b0a88b790e Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 17 Nov 2021 14:51:50 +0100 Subject: complete FuzzyReleaseMatcher refactoring We keep the name, since the api - "matcher.match(release)" - is the same; simplified queries; at most one query is performed against elasticsearch; parallel release retrieval from the API; optional support for release year windows; Test cases are expressed in yaml and will be auto-loaded from the specified directory; test work against the current search endpoint, which means the actual output may change on index updates; for the moment, we think this setup is relatively simple and not too unstable. about: title contrib, partial name input: > { "contribs": [ { "raw_name": "Adams" } ], "title": "digital libraries", "ext_ids": {} } release_year_padding: 1 expected: - 7rmvqtrb2jdyhcxxodihzzcugy - a2u6ougtsjcbvczou6sazsulcm - dy45vilej5diros6zmax46nm4e - exuwhhayird4fdjmmsiqpponlq - gqrj7jikezgcfpjfazhpf4e7c4 - mkmqt3453relbpuyktnmsg6hjq - t2g5sl3dgzchtnq7dynxyzje44 - t4tvenhrvzamraxrvvxivxmvga - wd3oeoi3bffknfbg2ymleqc4ja - y63a6dhrfnb7bltlxfynydbojy --- tests/files/simple_fuzzy_release_matcher/0.yaml | 16 +++ tests/files/simple_fuzzy_release_matcher/1.yaml | 24 ++++ tests/files/simple_fuzzy_release_matcher/2.yaml | 24 ++++ tests/files/simple_fuzzy_release_matcher/3.yaml | 19 ++++ tests/files/simple_fuzzy_release_matcher/4.yaml | 16 +++ tests/files/simple_fuzzy_release_matcher/5.yaml | 16 +++ tests/files/simple_fuzzy_release_matcher/6.yaml | 24 ++++ tests/files/simple_fuzzy_release_matcher/7.yaml | 10 ++ tests/files/simple_fuzzy_release_matcher/8.yaml | 139 ++++++++++++++++++++++++ tests/files/simple_fuzzy_release_matcher/9.yaml | 139 ++++++++++++++++++++++++ 10 files changed, 427 insertions(+) create mode 100644 tests/files/simple_fuzzy_release_matcher/0.yaml create mode 100644 tests/files/simple_fuzzy_release_matcher/1.yaml create mode 100644 tests/files/simple_fuzzy_release_matcher/2.yaml create mode 100644 tests/files/simple_fuzzy_release_matcher/3.yaml create mode 100644 tests/files/simple_fuzzy_release_matcher/4.yaml create mode 100644 tests/files/simple_fuzzy_release_matcher/5.yaml create mode 100644 tests/files/simple_fuzzy_release_matcher/6.yaml create mode 100644 tests/files/simple_fuzzy_release_matcher/7.yaml create mode 100644 tests/files/simple_fuzzy_release_matcher/8.yaml create mode 100644 tests/files/simple_fuzzy_release_matcher/9.yaml (limited to 'tests/files') diff --git a/tests/files/simple_fuzzy_release_matcher/0.yaml b/tests/files/simple_fuzzy_release_matcher/0.yaml new file mode 100644 index 0000000..71fc992 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/0.yaml @@ -0,0 +1,16 @@ +about: title and contrib +input: > + { + "contribs": [ + { + "raw_name": "Michael Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 7rmvqtrb2jdyhcxxodihzzcugy + - upm5nljirrbsfenoyxsisciltq + - wd3oeoi3bffknfbg2ymleqc4ja diff --git a/tests/files/simple_fuzzy_release_matcher/1.yaml b/tests/files/simple_fuzzy_release_matcher/1.yaml new file mode 100644 index 0000000..df6a954 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/1.yaml @@ -0,0 +1,24 @@ +about: title contrib, partial name +input: > + { + "contribs": [ + { + "raw_name": "Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 7rmvqtrb2jdyhcxxodihzzcugy + - a2u6ougtsjcbvczou6sazsulcm + - dy45vilej5diros6zmax46nm4e + - exuwhhayird4fdjmmsiqpponlq + - gqrj7jikezgcfpjfazhpf4e7c4 + - mkmqt3453relbpuyktnmsg6hjq + - t2g5sl3dgzchtnq7dynxyzje44 + - t4tvenhrvzamraxrvvxivxmvga + - wd3oeoi3bffknfbg2ymleqc4ja + - y63a6dhrfnb7bltlxfynydbojy + diff --git a/tests/files/simple_fuzzy_release_matcher/2.yaml b/tests/files/simple_fuzzy_release_matcher/2.yaml new file mode 100644 index 0000000..df6a954 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/2.yaml @@ -0,0 +1,24 @@ +about: title contrib, partial name +input: > + { + "contribs": [ + { + "raw_name": "Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 7rmvqtrb2jdyhcxxodihzzcugy + - a2u6ougtsjcbvczou6sazsulcm + - dy45vilej5diros6zmax46nm4e + - exuwhhayird4fdjmmsiqpponlq + - gqrj7jikezgcfpjfazhpf4e7c4 + - mkmqt3453relbpuyktnmsg6hjq + - t2g5sl3dgzchtnq7dynxyzje44 + - t4tvenhrvzamraxrvvxivxmvga + - wd3oeoi3bffknfbg2ymleqc4ja + - y63a6dhrfnb7bltlxfynydbojy + diff --git a/tests/files/simple_fuzzy_release_matcher/3.yaml b/tests/files/simple_fuzzy_release_matcher/3.yaml new file mode 100644 index 0000000..1ab761b --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/3.yaml @@ -0,0 +1,19 @@ +about: title only +input: > + { + "title": "The future of scholarly communications", + "ext_ids": {} + } +release_year_padding: 0 +expected: + - '2f57funqizf4lcxjanls45upom' + - '3p2hngx6kfa33bdaobipimdzhe' + - '75dzcdywlbb3logmrrpkabanfa' + - 'ccoocm7uzjgwnlpfk5fbwfudjm' + - 'nfydgfziuvhete6p3lrn4u325u' + - 'ntpiporu75bendibjku4kjmd5q' + - 'op6a5fclonhrxm3zlo6ub2tlw4' + - 'opoxzl3zzbccdh5tptm5p2krem' + - 'umzryrtocbakberuubjm2pgxum' + - 'zb4bjnwqsveyzcwebvvmnsoq7u' + diff --git a/tests/files/simple_fuzzy_release_matcher/4.yaml b/tests/files/simple_fuzzy_release_matcher/4.yaml new file mode 100644 index 0000000..9419406 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/4.yaml @@ -0,0 +1,16 @@ +about: title, year +input: > + { + "title": "The future of scholarly communications", + "release_year": 2014, + "ext_ids": {} + } +release_year_padding: 0 +expected: + - '66r4s55dpvht5jghwkhupai2km' + - 'ccoocm7uzjgwnlpfk5fbwfudjm' + - 'du4awowpsbbcjlo2pe6dvmxewu' + - 'nfydgfziuvhete6p3lrn4u325u' + - 'ntpiporu75bendibjku4kjmd5q' + - 'op6a5fclonhrxm3zlo6ub2tlw4' + - 'xsuxmk5dyba6rnkeslipxxdlzi' diff --git a/tests/files/simple_fuzzy_release_matcher/5.yaml b/tests/files/simple_fuzzy_release_matcher/5.yaml new file mode 100644 index 0000000..1eb435b --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/5.yaml @@ -0,0 +1,16 @@ +about: contrib, year +input: > + { + "contribs": [ + { + "raw_name": "Lissandrini" + } + ], + "release_year": 2014, + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 'xfhjsixnlvbibigrilisqqvfk4' + - 'zfhfpo2shrdexpgd2as4fz7wnm' + - 'cyct2bqs5feqbowg6ovv53pdfq' diff --git a/tests/files/simple_fuzzy_release_matcher/6.yaml b/tests/files/simple_fuzzy_release_matcher/6.yaml new file mode 100644 index 0000000..ae52b23 --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/6.yaml @@ -0,0 +1,24 @@ +about: contrib, year +input: > + { + "contribs": [ + { + "raw_name": "Goodwin" + } + ], + "release_year": 2014, + "ext_ids": {} + } +release_year_padding: 0 +expected: + - 2bbtr4cltbgannqc6vqijvvzdq + - 34i2hba6tzf3xomobhumfkkvga + - 62sz5fhhuvenpfctf6wejl5m2i + - chnqmdm4yfd4zk6kawujvsbhwy + - chs7be23vfdthk3xre54w534zm + - f5lp3nipazhyxoa2xarlomkofm + - hikujb5wmvasnoat2myt56l63y + - qbom7rwqtzfypa5hltgbx4e2iq + - qh44drz3bvg2ndzwzc55xops7y + - r4n57quetbf7tddwodjauegmzq + diff --git a/tests/files/simple_fuzzy_release_matcher/7.yaml b/tests/files/simple_fuzzy_release_matcher/7.yaml new file mode 100644 index 0000000..2330f0d --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/7.yaml @@ -0,0 +1,10 @@ +about: just a subtitle +input: > + { + "subtitle": "topographies parisiennes", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - yvqtz2zvkzcbpj4jxrp7bvydfu + - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/files/simple_fuzzy_release_matcher/8.yaml b/tests/files/simple_fuzzy_release_matcher/8.yaml new file mode 100644 index 0000000..b43e53a --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/8.yaml @@ -0,0 +1,139 @@ +about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu +input: > + { + "abstracts": [], + "refs": [], + "contribs": [ + { + "index": 0, + "raw_name": "Annelies Schulte Nordholt", + "role": "author", + "extra": { + "seq": "first" + } + } + ], + "publisher": "Uopen Journals", + "pages": "66", + "ext_ids": { + "doi": "10.18352/bmgn-lchr.128" + }, + "release_year": 2008, + "release_date": "2008-02-19", + "release_stage": "published", + "release_type": "article-journal", + "container_id": "sm7svbj64vc55gj4p23t7c3lrm", + "webcaptures": [], + "filesets": [], + "files": [ + { + "release_ids": [ + "yvqtz2zvkzcbpj4jxrp7bvydfu" + ], + "mimetype": "application/pdf", + "urls": [ + { + "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "publisher" + }, + { + "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "webarchive" + } + ], + "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23", + "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40", + "md5": "7dae3ec6c1d65cae6a91554071cc9625", + "size": 889420, + "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75", + "ident": "oew6z4a6gvfqxc5kiy2r62ucfq", + "state": "active" + } + ], + "container": { + "wikidata_qid": "Q15763709", + "issnp": "1873-5045", + "issne": "1873-5045", + "issnl": "1873-5045", + "publisher": "Uopen Journals", + "name": "Relief: Revue Électronique de Littérature Francaise", + "extra": { + "country": "nl", + "default_license": "CC-BY", + "doaj": { + "as_of": "2021-11-20", + "default_license": "CC-BY", + "seal": false + }, + "kbart": { + "clockss": { + "year_spans": [ + [ + 2007, + 2016 + ] + ] + }, + "lockss": { + "year_spans": [ + [ + 2007, + 2019 + ] + ] + }, + "pkp_pln": { + "year_spans": [ + [ + 2007, + 2021 + ] + ] + }, + "portico": { + "year_spans": [ + [ + 2007, + 2017 + ] + ] + } + }, + "languages": [ + "en" + ], + "publisher_type": "unipress", + "road": { + "as_of": "2018-01-24" + }, + "sherpa_romeo": { + "color": "blue" + }, + "szczepanski": { + "as_of": "2018" + }, + "urls": [ + "https://www.revue-relief.org/", + "http://www.revue-relief.org/index.php/relief", + "http://www.revue-relief.org/index.php/relief/about" + ] + }, + "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e", + "ident": "sm7svbj64vc55gj4p23t7c3lrm", + "state": "active" + }, + "work_id": "qcpd2i2txfdi5emqb7fxsawk6e", + "title": "Georges Perec: topographies parisiennes du flâneur", + "state": "active", + "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu", + "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff", + "extra": { + "crossref": { + "type": "journal-article" + } + } + } +release_year_padding: 1 +expected: + - yvqtz2zvkzcbpj4jxrp7bvydfu + - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/files/simple_fuzzy_release_matcher/9.yaml b/tests/files/simple_fuzzy_release_matcher/9.yaml new file mode 100644 index 0000000..b43e53a --- /dev/null +++ b/tests/files/simple_fuzzy_release_matcher/9.yaml @@ -0,0 +1,139 @@ +about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu +input: > + { + "abstracts": [], + "refs": [], + "contribs": [ + { + "index": 0, + "raw_name": "Annelies Schulte Nordholt", + "role": "author", + "extra": { + "seq": "first" + } + } + ], + "publisher": "Uopen Journals", + "pages": "66", + "ext_ids": { + "doi": "10.18352/bmgn-lchr.128" + }, + "release_year": 2008, + "release_date": "2008-02-19", + "release_stage": "published", + "release_type": "article-journal", + "container_id": "sm7svbj64vc55gj4p23t7c3lrm", + "webcaptures": [], + "filesets": [], + "files": [ + { + "release_ids": [ + "yvqtz2zvkzcbpj4jxrp7bvydfu" + ], + "mimetype": "application/pdf", + "urls": [ + { + "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "publisher" + }, + { + "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "webarchive" + } + ], + "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23", + "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40", + "md5": "7dae3ec6c1d65cae6a91554071cc9625", + "size": 889420, + "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75", + "ident": "oew6z4a6gvfqxc5kiy2r62ucfq", + "state": "active" + } + ], + "container": { + "wikidata_qid": "Q15763709", + "issnp": "1873-5045", + "issne": "1873-5045", + "issnl": "1873-5045", + "publisher": "Uopen Journals", + "name": "Relief: Revue Électronique de Littérature Francaise", + "extra": { + "country": "nl", + "default_license": "CC-BY", + "doaj": { + "as_of": "2021-11-20", + "default_license": "CC-BY", + "seal": false + }, + "kbart": { + "clockss": { + "year_spans": [ + [ + 2007, + 2016 + ] + ] + }, + "lockss": { + "year_spans": [ + [ + 2007, + 2019 + ] + ] + }, + "pkp_pln": { + "year_spans": [ + [ + 2007, + 2021 + ] + ] + }, + "portico": { + "year_spans": [ + [ + 2007, + 2017 + ] + ] + } + }, + "languages": [ + "en" + ], + "publisher_type": "unipress", + "road": { + "as_of": "2018-01-24" + }, + "sherpa_romeo": { + "color": "blue" + }, + "szczepanski": { + "as_of": "2018" + }, + "urls": [ + "https://www.revue-relief.org/", + "http://www.revue-relief.org/index.php/relief", + "http://www.revue-relief.org/index.php/relief/about" + ] + }, + "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e", + "ident": "sm7svbj64vc55gj4p23t7c3lrm", + "state": "active" + }, + "work_id": "qcpd2i2txfdi5emqb7fxsawk6e", + "title": "Georges Perec: topographies parisiennes du flâneur", + "state": "active", + "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu", + "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff", + "extra": { + "crossref": { + "type": "journal-article" + } + } + } +release_year_padding: 1 +expected: + - yvqtz2zvkzcbpj4jxrp7bvydfu + - lttg27o7mjganpkhrgy3xyv7vu -- cgit v1.2.3