From 5bd8ee08a3e0f52893c1b7afa6bc4f062b7c062c Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Mon, 6 Dec 2021 19:59:51 +0100 Subject: matching: cleanup test files --- .../0.yaml | 13 -- .../1.yaml | 13 -- .../2.yaml | 16 --- .../3.yaml | 16 --- .../4.yaml | 16 --- .../0.yaml | 14 --- .../1.yaml | 14 --- .../2.yaml | 17 --- .../3.yaml | 17 --- .../4.yaml | 17 --- .../5.yaml | 17 --- .../6.yaml | 14 --- .../7.yaml | 17 --- tests/files/fuzzy_release_matcher/0.yaml | 16 +++ tests/files/fuzzy_release_matcher/1.yaml | 24 ++++ tests/files/fuzzy_release_matcher/2.yaml | 24 ++++ tests/files/fuzzy_release_matcher/3.yaml | 19 +++ tests/files/fuzzy_release_matcher/4.yaml | 16 +++ tests/files/fuzzy_release_matcher/5.yaml | 16 +++ tests/files/fuzzy_release_matcher/6.yaml | 24 ++++ tests/files/fuzzy_release_matcher/7.yaml | 10 ++ tests/files/fuzzy_release_matcher/8.yaml | 139 +++++++++++++++++++++ tests/files/fuzzy_release_matcher/9.yaml | 139 +++++++++++++++++++++ tests/files/simple_fuzzy_release_matcher/0.yaml | 16 --- tests/files/simple_fuzzy_release_matcher/1.yaml | 24 ---- tests/files/simple_fuzzy_release_matcher/2.yaml | 24 ---- tests/files/simple_fuzzy_release_matcher/3.yaml | 19 --- tests/files/simple_fuzzy_release_matcher/4.yaml | 16 --- tests/files/simple_fuzzy_release_matcher/5.yaml | 16 --- tests/files/simple_fuzzy_release_matcher/6.yaml | 24 ---- tests/files/simple_fuzzy_release_matcher/7.yaml | 10 -- tests/files/simple_fuzzy_release_matcher/8.yaml | 139 --------------------- tests/files/simple_fuzzy_release_matcher/9.yaml | 139 --------------------- tests/test_matching.py | 2 +- 34 files changed, 428 insertions(+), 629 deletions(-) delete mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml delete mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml delete mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml delete mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml delete mode 100644 tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml delete mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml delete mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml delete mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml delete mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml delete mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml delete mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml delete mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml delete mode 100644 tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml create mode 100644 tests/files/fuzzy_release_matcher/0.yaml create mode 100644 tests/files/fuzzy_release_matcher/1.yaml create mode 100644 tests/files/fuzzy_release_matcher/2.yaml create mode 100644 tests/files/fuzzy_release_matcher/3.yaml create mode 100644 tests/files/fuzzy_release_matcher/4.yaml create mode 100644 tests/files/fuzzy_release_matcher/5.yaml create mode 100644 tests/files/fuzzy_release_matcher/6.yaml create mode 100644 tests/files/fuzzy_release_matcher/7.yaml create mode 100644 tests/files/fuzzy_release_matcher/8.yaml create mode 100644 tests/files/fuzzy_release_matcher/9.yaml delete mode 100644 tests/files/simple_fuzzy_release_matcher/0.yaml delete mode 100644 tests/files/simple_fuzzy_release_matcher/1.yaml delete mode 100644 tests/files/simple_fuzzy_release_matcher/2.yaml delete mode 100644 tests/files/simple_fuzzy_release_matcher/3.yaml delete mode 100644 tests/files/simple_fuzzy_release_matcher/4.yaml delete mode 100644 tests/files/simple_fuzzy_release_matcher/5.yaml delete mode 100644 tests/files/simple_fuzzy_release_matcher/6.yaml delete mode 100644 tests/files/simple_fuzzy_release_matcher/7.yaml delete mode 100644 tests/files/simple_fuzzy_release_matcher/8.yaml delete mode 100644 tests/files/simple_fuzzy_release_matcher/9.yaml (limited to 'tests') diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml deleted file mode 100644 index 2df8d9a..0000000 --- a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/0.yaml +++ /dev/null @@ -1,13 +0,0 @@ -title: titles are case insensitive -date: 2021-11-08 -input: > - { - "contribs": [ - { - "raw_name": "Michael Adams" - } - ], - "title": "digital libraries", - "ext_ids": {} - } -expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml deleted file mode 100644 index 1070408..0000000 --- a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/1.yaml +++ /dev/null @@ -1,13 +0,0 @@ -title: another vanilla query -date: 2021-11-08 -input: > - { - "contribs": [ - { - "raw_name": "Poul-Henning Kamp" - } - ], - "title": "The hyperdimensional tar pit", - "ext_ids": {} - } -expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml deleted file mode 100644 index 882e746..0000000 --- a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/2.yaml +++ /dev/null @@ -1,16 +0,0 @@ -title: order of contribs does not matter -date: 2021-11-08 -input: > - { - "contribs": [ - { - "raw_name": "Maurice Florence" - }, - { - "raw_name": "Tuomo Tiisala" - } - ], - "title": "Foucault", - "ext_ids": {} - } -expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml deleted file mode 100644 index 0a2ad12..0000000 --- a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/3.yaml +++ /dev/null @@ -1,16 +0,0 @@ -title: order of contribs does not matter -date: 2021-11-08 -input: > - { - "contribs": [ - { - "raw_name": "Tuomo Tiisala" - }, - { - "raw_name": "Maurice Florence" - } - ], - "title": "Foucault", - "ext_ids": {} - } -expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml b/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml deleted file mode 100644 index 36ea0fe..0000000 --- a/tests/files/fuzzy_release_match_release_exact_title_exact_contrib/4.yaml +++ /dev/null @@ -1,16 +0,0 @@ -title: short version of name should not work -date: 2021-11-08 -input: > - { - "contribs": [ - { - "raw_name": "Tuomo Tiisala" - }, - { - "raw_name": "M. Florence" - } - ], - "title": "Foucault", - "ext_ids": {} - } -expected: 0 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml deleted file mode 100644 index 07230e8..0000000 --- a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/0.yaml +++ /dev/null @@ -1,14 +0,0 @@ -title: titles are case insensitive -date: 2021-11-08 -input: > - { - "contribs": [ - { - "raw_name": "Michael Adams" - } - ], - "title": "digital libraries", - "ext_ids": {} - } -jaccard_index_threshold: 1.0 -expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml deleted file mode 100644 index 62e9586..0000000 --- a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/1.yaml +++ /dev/null @@ -1,14 +0,0 @@ -title: another vanilla query -date: 2021-11-08 -input: > - { - "contribs": [ - { - "raw_name": "Poul-Henning Kamp" - } - ], - "title": "The hyperdimensional tar pit", - "ext_ids": {} - } -jaccard_index_threshold: 1.0 -expected: 2 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml deleted file mode 100644 index b89e825..0000000 --- a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/2.yaml +++ /dev/null @@ -1,17 +0,0 @@ -title: order of contribs does not matter -date: 2021-11-08 -input: > - { - "contribs": [ - { - "raw_name": "Maurice Florence" - }, - { - "raw_name": "Tuomo Tiisala" - } - ], - "title": "Foucault", - "ext_ids": {} - } -jaccard_index_threshold: 1.0 -expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml deleted file mode 100644 index 3de7262..0000000 --- a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/3.yaml +++ /dev/null @@ -1,17 +0,0 @@ -title: order of contribs does not matter -date: 2021-11-08 -input: > - { - "contribs": [ - { - "raw_name": "Tuomo Tiisala" - }, - { - "raw_name": "Maurice Florence" - } - ], - "title": "Foucault", - "ext_ids": {} - } -jaccard_index_threshold: 1.0 -expected: 1 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml deleted file mode 100644 index 39fb065..0000000 --- a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/4.yaml +++ /dev/null @@ -1,17 +0,0 @@ -title: short version of name should not work -date: 2021-11-08 -input: > - { - "contribs": [ - { - "raw_name": "Tuomo Tiisala" - }, - { - "raw_name": "M. Florence" - } - ], - "title": "Foucault", - "ext_ids": {} - } -jaccard_index_threshold: 1.0 -expected: 0 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml deleted file mode 100644 index fff19fa..0000000 --- a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/5.yaml +++ /dev/null @@ -1,17 +0,0 @@ -title: here, Iz Beltagy is missing from author, but still retrieved -date: 2021-11-08 -input: > - { - "contribs": [ - { - "raw_name": "Arman Cohan" - }, - { - "raw_name": "Kyle Lo" - } - ], - "title": "SciBERT: A Pretrained Language Model for Scientific Text", - "ext_ids": {} - } -jaccard_index_threshold: 0.5 -expected: 3 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml deleted file mode 100644 index d4e0025..0000000 --- a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/6.yaml +++ /dev/null @@ -1,14 +0,0 @@ -title: here, 2/3 authors are missing, we fail with jaccard index 0.5 -date: 2021-11-08 -input: > - { - "contribs": [ - { - "raw_name": "Arman Cohan" - } - ], - "title": "SciBERT: A Pretrained Language Model for Scientific Text", - "ext_ids": {} - } -jaccard_index_threshold: 0.5 -expected: 0 diff --git a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml b/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml deleted file mode 100644 index 23d5a8d..0000000 --- a/tests/files/fuzzy_release_match_release_exact_title_partial_contrib/7.yaml +++ /dev/null @@ -1,17 +0,0 @@ -title: match, despite trailing whitespace -date: 2021-11-08 -input: > - { - "contribs": [ - { - "raw_name": "Arman Cohan" - }, - { - "raw_name": "Kyle Lo" - } - ], - "title": "SciBERT: A Pretrained Language Model for Scientific Text ", - "ext_ids": {} - } -jaccard_index_threshold: 0.5 -expected: 3 diff --git a/tests/files/fuzzy_release_matcher/0.yaml b/tests/files/fuzzy_release_matcher/0.yaml new file mode 100644 index 0000000..71fc992 --- /dev/null +++ b/tests/files/fuzzy_release_matcher/0.yaml @@ -0,0 +1,16 @@ +about: title and contrib +input: > + { + "contribs": [ + { + "raw_name": "Michael Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 7rmvqtrb2jdyhcxxodihzzcugy + - upm5nljirrbsfenoyxsisciltq + - wd3oeoi3bffknfbg2ymleqc4ja diff --git a/tests/files/fuzzy_release_matcher/1.yaml b/tests/files/fuzzy_release_matcher/1.yaml new file mode 100644 index 0000000..df6a954 --- /dev/null +++ b/tests/files/fuzzy_release_matcher/1.yaml @@ -0,0 +1,24 @@ +about: title contrib, partial name +input: > + { + "contribs": [ + { + "raw_name": "Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 7rmvqtrb2jdyhcxxodihzzcugy + - a2u6ougtsjcbvczou6sazsulcm + - dy45vilej5diros6zmax46nm4e + - exuwhhayird4fdjmmsiqpponlq + - gqrj7jikezgcfpjfazhpf4e7c4 + - mkmqt3453relbpuyktnmsg6hjq + - t2g5sl3dgzchtnq7dynxyzje44 + - t4tvenhrvzamraxrvvxivxmvga + - wd3oeoi3bffknfbg2ymleqc4ja + - y63a6dhrfnb7bltlxfynydbojy + diff --git a/tests/files/fuzzy_release_matcher/2.yaml b/tests/files/fuzzy_release_matcher/2.yaml new file mode 100644 index 0000000..df6a954 --- /dev/null +++ b/tests/files/fuzzy_release_matcher/2.yaml @@ -0,0 +1,24 @@ +about: title contrib, partial name +input: > + { + "contribs": [ + { + "raw_name": "Adams" + } + ], + "title": "digital libraries", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 7rmvqtrb2jdyhcxxodihzzcugy + - a2u6ougtsjcbvczou6sazsulcm + - dy45vilej5diros6zmax46nm4e + - exuwhhayird4fdjmmsiqpponlq + - gqrj7jikezgcfpjfazhpf4e7c4 + - mkmqt3453relbpuyktnmsg6hjq + - t2g5sl3dgzchtnq7dynxyzje44 + - t4tvenhrvzamraxrvvxivxmvga + - wd3oeoi3bffknfbg2ymleqc4ja + - y63a6dhrfnb7bltlxfynydbojy + diff --git a/tests/files/fuzzy_release_matcher/3.yaml b/tests/files/fuzzy_release_matcher/3.yaml new file mode 100644 index 0000000..1ab761b --- /dev/null +++ b/tests/files/fuzzy_release_matcher/3.yaml @@ -0,0 +1,19 @@ +about: title only +input: > + { + "title": "The future of scholarly communications", + "ext_ids": {} + } +release_year_padding: 0 +expected: + - '2f57funqizf4lcxjanls45upom' + - '3p2hngx6kfa33bdaobipimdzhe' + - '75dzcdywlbb3logmrrpkabanfa' + - 'ccoocm7uzjgwnlpfk5fbwfudjm' + - 'nfydgfziuvhete6p3lrn4u325u' + - 'ntpiporu75bendibjku4kjmd5q' + - 'op6a5fclonhrxm3zlo6ub2tlw4' + - 'opoxzl3zzbccdh5tptm5p2krem' + - 'umzryrtocbakberuubjm2pgxum' + - 'zb4bjnwqsveyzcwebvvmnsoq7u' + diff --git a/tests/files/fuzzy_release_matcher/4.yaml b/tests/files/fuzzy_release_matcher/4.yaml new file mode 100644 index 0000000..9419406 --- /dev/null +++ b/tests/files/fuzzy_release_matcher/4.yaml @@ -0,0 +1,16 @@ +about: title, year +input: > + { + "title": "The future of scholarly communications", + "release_year": 2014, + "ext_ids": {} + } +release_year_padding: 0 +expected: + - '66r4s55dpvht5jghwkhupai2km' + - 'ccoocm7uzjgwnlpfk5fbwfudjm' + - 'du4awowpsbbcjlo2pe6dvmxewu' + - 'nfydgfziuvhete6p3lrn4u325u' + - 'ntpiporu75bendibjku4kjmd5q' + - 'op6a5fclonhrxm3zlo6ub2tlw4' + - 'xsuxmk5dyba6rnkeslipxxdlzi' diff --git a/tests/files/fuzzy_release_matcher/5.yaml b/tests/files/fuzzy_release_matcher/5.yaml new file mode 100644 index 0000000..1eb435b --- /dev/null +++ b/tests/files/fuzzy_release_matcher/5.yaml @@ -0,0 +1,16 @@ +about: contrib, year +input: > + { + "contribs": [ + { + "raw_name": "Lissandrini" + } + ], + "release_year": 2014, + "ext_ids": {} + } +release_year_padding: 1 +expected: + - 'xfhjsixnlvbibigrilisqqvfk4' + - 'zfhfpo2shrdexpgd2as4fz7wnm' + - 'cyct2bqs5feqbowg6ovv53pdfq' diff --git a/tests/files/fuzzy_release_matcher/6.yaml b/tests/files/fuzzy_release_matcher/6.yaml new file mode 100644 index 0000000..ae52b23 --- /dev/null +++ b/tests/files/fuzzy_release_matcher/6.yaml @@ -0,0 +1,24 @@ +about: contrib, year +input: > + { + "contribs": [ + { + "raw_name": "Goodwin" + } + ], + "release_year": 2014, + "ext_ids": {} + } +release_year_padding: 0 +expected: + - 2bbtr4cltbgannqc6vqijvvzdq + - 34i2hba6tzf3xomobhumfkkvga + - 62sz5fhhuvenpfctf6wejl5m2i + - chnqmdm4yfd4zk6kawujvsbhwy + - chs7be23vfdthk3xre54w534zm + - f5lp3nipazhyxoa2xarlomkofm + - hikujb5wmvasnoat2myt56l63y + - qbom7rwqtzfypa5hltgbx4e2iq + - qh44drz3bvg2ndzwzc55xops7y + - r4n57quetbf7tddwodjauegmzq + diff --git a/tests/files/fuzzy_release_matcher/7.yaml b/tests/files/fuzzy_release_matcher/7.yaml new file mode 100644 index 0000000..2330f0d --- /dev/null +++ b/tests/files/fuzzy_release_matcher/7.yaml @@ -0,0 +1,10 @@ +about: just a subtitle +input: > + { + "subtitle": "topographies parisiennes", + "ext_ids": {} + } +release_year_padding: 1 +expected: + - yvqtz2zvkzcbpj4jxrp7bvydfu + - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/files/fuzzy_release_matcher/8.yaml b/tests/files/fuzzy_release_matcher/8.yaml new file mode 100644 index 0000000..b43e53a --- /dev/null +++ b/tests/files/fuzzy_release_matcher/8.yaml @@ -0,0 +1,139 @@ +about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu +input: > + { + "abstracts": [], + "refs": [], + "contribs": [ + { + "index": 0, + "raw_name": "Annelies Schulte Nordholt", + "role": "author", + "extra": { + "seq": "first" + } + } + ], + "publisher": "Uopen Journals", + "pages": "66", + "ext_ids": { + "doi": "10.18352/bmgn-lchr.128" + }, + "release_year": 2008, + "release_date": "2008-02-19", + "release_stage": "published", + "release_type": "article-journal", + "container_id": "sm7svbj64vc55gj4p23t7c3lrm", + "webcaptures": [], + "filesets": [], + "files": [ + { + "release_ids": [ + "yvqtz2zvkzcbpj4jxrp7bvydfu" + ], + "mimetype": "application/pdf", + "urls": [ + { + "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "publisher" + }, + { + "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "webarchive" + } + ], + "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23", + "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40", + "md5": "7dae3ec6c1d65cae6a91554071cc9625", + "size": 889420, + "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75", + "ident": "oew6z4a6gvfqxc5kiy2r62ucfq", + "state": "active" + } + ], + "container": { + "wikidata_qid": "Q15763709", + "issnp": "1873-5045", + "issne": "1873-5045", + "issnl": "1873-5045", + "publisher": "Uopen Journals", + "name": "Relief: Revue Électronique de Littérature Francaise", + "extra": { + "country": "nl", + "default_license": "CC-BY", + "doaj": { + "as_of": "2021-11-20", + "default_license": "CC-BY", + "seal": false + }, + "kbart": { + "clockss": { + "year_spans": [ + [ + 2007, + 2016 + ] + ] + }, + "lockss": { + "year_spans": [ + [ + 2007, + 2019 + ] + ] + }, + "pkp_pln": { + "year_spans": [ + [ + 2007, + 2021 + ] + ] + }, + "portico": { + "year_spans": [ + [ + 2007, + 2017 + ] + ] + } + }, + "languages": [ + "en" + ], + "publisher_type": "unipress", + "road": { + "as_of": "2018-01-24" + }, + "sherpa_romeo": { + "color": "blue" + }, + "szczepanski": { + "as_of": "2018" + }, + "urls": [ + "https://www.revue-relief.org/", + "http://www.revue-relief.org/index.php/relief", + "http://www.revue-relief.org/index.php/relief/about" + ] + }, + "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e", + "ident": "sm7svbj64vc55gj4p23t7c3lrm", + "state": "active" + }, + "work_id": "qcpd2i2txfdi5emqb7fxsawk6e", + "title": "Georges Perec: topographies parisiennes du flâneur", + "state": "active", + "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu", + "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff", + "extra": { + "crossref": { + "type": "journal-article" + } + } + } +release_year_padding: 1 +expected: + - yvqtz2zvkzcbpj4jxrp7bvydfu + - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/files/fuzzy_release_matcher/9.yaml b/tests/files/fuzzy_release_matcher/9.yaml new file mode 100644 index 0000000..b43e53a --- /dev/null +++ b/tests/files/fuzzy_release_matcher/9.yaml @@ -0,0 +1,139 @@ +about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu +input: > + { + "abstracts": [], + "refs": [], + "contribs": [ + { + "index": 0, + "raw_name": "Annelies Schulte Nordholt", + "role": "author", + "extra": { + "seq": "first" + } + } + ], + "publisher": "Uopen Journals", + "pages": "66", + "ext_ids": { + "doi": "10.18352/bmgn-lchr.128" + }, + "release_year": 2008, + "release_date": "2008-02-19", + "release_stage": "published", + "release_type": "article-journal", + "container_id": "sm7svbj64vc55gj4p23t7c3lrm", + "webcaptures": [], + "filesets": [], + "files": [ + { + "release_ids": [ + "yvqtz2zvkzcbpj4jxrp7bvydfu" + ], + "mimetype": "application/pdf", + "urls": [ + { + "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "publisher" + }, + { + "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", + "rel": "webarchive" + } + ], + "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23", + "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40", + "md5": "7dae3ec6c1d65cae6a91554071cc9625", + "size": 889420, + "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75", + "ident": "oew6z4a6gvfqxc5kiy2r62ucfq", + "state": "active" + } + ], + "container": { + "wikidata_qid": "Q15763709", + "issnp": "1873-5045", + "issne": "1873-5045", + "issnl": "1873-5045", + "publisher": "Uopen Journals", + "name": "Relief: Revue Électronique de Littérature Francaise", + "extra": { + "country": "nl", + "default_license": "CC-BY", + "doaj": { + "as_of": "2021-11-20", + "default_license": "CC-BY", + "seal": false + }, + "kbart": { + "clockss": { + "year_spans": [ + [ + 2007, + 2016 + ] + ] + }, + "lockss": { + "year_spans": [ + [ + 2007, + 2019 + ] + ] + }, + "pkp_pln": { + "year_spans": [ + [ + 2007, + 2021 + ] + ] + }, + "portico": { + "year_spans": [ + [ + 2007, + 2017 + ] + ] + } + }, + "languages": [ + "en" + ], + "publisher_type": "unipress", + "road": { + "as_of": "2018-01-24" + }, + "sherpa_romeo": { + "color": "blue" + }, + "szczepanski": { + "as_of": "2018" + }, + "urls": [ + "https://www.revue-relief.org/", + "http://www.revue-relief.org/index.php/relief", + "http://www.revue-relief.org/index.php/relief/about" + ] + }, + "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e", + "ident": "sm7svbj64vc55gj4p23t7c3lrm", + "state": "active" + }, + "work_id": "qcpd2i2txfdi5emqb7fxsawk6e", + "title": "Georges Perec: topographies parisiennes du flâneur", + "state": "active", + "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu", + "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff", + "extra": { + "crossref": { + "type": "journal-article" + } + } + } +release_year_padding: 1 +expected: + - yvqtz2zvkzcbpj4jxrp7bvydfu + - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/files/simple_fuzzy_release_matcher/0.yaml b/tests/files/simple_fuzzy_release_matcher/0.yaml deleted file mode 100644 index 71fc992..0000000 --- a/tests/files/simple_fuzzy_release_matcher/0.yaml +++ /dev/null @@ -1,16 +0,0 @@ -about: title and contrib -input: > - { - "contribs": [ - { - "raw_name": "Michael Adams" - } - ], - "title": "digital libraries", - "ext_ids": {} - } -release_year_padding: 1 -expected: - - 7rmvqtrb2jdyhcxxodihzzcugy - - upm5nljirrbsfenoyxsisciltq - - wd3oeoi3bffknfbg2ymleqc4ja diff --git a/tests/files/simple_fuzzy_release_matcher/1.yaml b/tests/files/simple_fuzzy_release_matcher/1.yaml deleted file mode 100644 index df6a954..0000000 --- a/tests/files/simple_fuzzy_release_matcher/1.yaml +++ /dev/null @@ -1,24 +0,0 @@ -about: title contrib, partial name -input: > - { - "contribs": [ - { - "raw_name": "Adams" - } - ], - "title": "digital libraries", - "ext_ids": {} - } -release_year_padding: 1 -expected: - - 7rmvqtrb2jdyhcxxodihzzcugy - - a2u6ougtsjcbvczou6sazsulcm - - dy45vilej5diros6zmax46nm4e - - exuwhhayird4fdjmmsiqpponlq - - gqrj7jikezgcfpjfazhpf4e7c4 - - mkmqt3453relbpuyktnmsg6hjq - - t2g5sl3dgzchtnq7dynxyzje44 - - t4tvenhrvzamraxrvvxivxmvga - - wd3oeoi3bffknfbg2ymleqc4ja - - y63a6dhrfnb7bltlxfynydbojy - diff --git a/tests/files/simple_fuzzy_release_matcher/2.yaml b/tests/files/simple_fuzzy_release_matcher/2.yaml deleted file mode 100644 index df6a954..0000000 --- a/tests/files/simple_fuzzy_release_matcher/2.yaml +++ /dev/null @@ -1,24 +0,0 @@ -about: title contrib, partial name -input: > - { - "contribs": [ - { - "raw_name": "Adams" - } - ], - "title": "digital libraries", - "ext_ids": {} - } -release_year_padding: 1 -expected: - - 7rmvqtrb2jdyhcxxodihzzcugy - - a2u6ougtsjcbvczou6sazsulcm - - dy45vilej5diros6zmax46nm4e - - exuwhhayird4fdjmmsiqpponlq - - gqrj7jikezgcfpjfazhpf4e7c4 - - mkmqt3453relbpuyktnmsg6hjq - - t2g5sl3dgzchtnq7dynxyzje44 - - t4tvenhrvzamraxrvvxivxmvga - - wd3oeoi3bffknfbg2ymleqc4ja - - y63a6dhrfnb7bltlxfynydbojy - diff --git a/tests/files/simple_fuzzy_release_matcher/3.yaml b/tests/files/simple_fuzzy_release_matcher/3.yaml deleted file mode 100644 index 1ab761b..0000000 --- a/tests/files/simple_fuzzy_release_matcher/3.yaml +++ /dev/null @@ -1,19 +0,0 @@ -about: title only -input: > - { - "title": "The future of scholarly communications", - "ext_ids": {} - } -release_year_padding: 0 -expected: - - '2f57funqizf4lcxjanls45upom' - - '3p2hngx6kfa33bdaobipimdzhe' - - '75dzcdywlbb3logmrrpkabanfa' - - 'ccoocm7uzjgwnlpfk5fbwfudjm' - - 'nfydgfziuvhete6p3lrn4u325u' - - 'ntpiporu75bendibjku4kjmd5q' - - 'op6a5fclonhrxm3zlo6ub2tlw4' - - 'opoxzl3zzbccdh5tptm5p2krem' - - 'umzryrtocbakberuubjm2pgxum' - - 'zb4bjnwqsveyzcwebvvmnsoq7u' - diff --git a/tests/files/simple_fuzzy_release_matcher/4.yaml b/tests/files/simple_fuzzy_release_matcher/4.yaml deleted file mode 100644 index 9419406..0000000 --- a/tests/files/simple_fuzzy_release_matcher/4.yaml +++ /dev/null @@ -1,16 +0,0 @@ -about: title, year -input: > - { - "title": "The future of scholarly communications", - "release_year": 2014, - "ext_ids": {} - } -release_year_padding: 0 -expected: - - '66r4s55dpvht5jghwkhupai2km' - - 'ccoocm7uzjgwnlpfk5fbwfudjm' - - 'du4awowpsbbcjlo2pe6dvmxewu' - - 'nfydgfziuvhete6p3lrn4u325u' - - 'ntpiporu75bendibjku4kjmd5q' - - 'op6a5fclonhrxm3zlo6ub2tlw4' - - 'xsuxmk5dyba6rnkeslipxxdlzi' diff --git a/tests/files/simple_fuzzy_release_matcher/5.yaml b/tests/files/simple_fuzzy_release_matcher/5.yaml deleted file mode 100644 index 1eb435b..0000000 --- a/tests/files/simple_fuzzy_release_matcher/5.yaml +++ /dev/null @@ -1,16 +0,0 @@ -about: contrib, year -input: > - { - "contribs": [ - { - "raw_name": "Lissandrini" - } - ], - "release_year": 2014, - "ext_ids": {} - } -release_year_padding: 1 -expected: - - 'xfhjsixnlvbibigrilisqqvfk4' - - 'zfhfpo2shrdexpgd2as4fz7wnm' - - 'cyct2bqs5feqbowg6ovv53pdfq' diff --git a/tests/files/simple_fuzzy_release_matcher/6.yaml b/tests/files/simple_fuzzy_release_matcher/6.yaml deleted file mode 100644 index ae52b23..0000000 --- a/tests/files/simple_fuzzy_release_matcher/6.yaml +++ /dev/null @@ -1,24 +0,0 @@ -about: contrib, year -input: > - { - "contribs": [ - { - "raw_name": "Goodwin" - } - ], - "release_year": 2014, - "ext_ids": {} - } -release_year_padding: 0 -expected: - - 2bbtr4cltbgannqc6vqijvvzdq - - 34i2hba6tzf3xomobhumfkkvga - - 62sz5fhhuvenpfctf6wejl5m2i - - chnqmdm4yfd4zk6kawujvsbhwy - - chs7be23vfdthk3xre54w534zm - - f5lp3nipazhyxoa2xarlomkofm - - hikujb5wmvasnoat2myt56l63y - - qbom7rwqtzfypa5hltgbx4e2iq - - qh44drz3bvg2ndzwzc55xops7y - - r4n57quetbf7tddwodjauegmzq - diff --git a/tests/files/simple_fuzzy_release_matcher/7.yaml b/tests/files/simple_fuzzy_release_matcher/7.yaml deleted file mode 100644 index 2330f0d..0000000 --- a/tests/files/simple_fuzzy_release_matcher/7.yaml +++ /dev/null @@ -1,10 +0,0 @@ -about: just a subtitle -input: > - { - "subtitle": "topographies parisiennes", - "ext_ids": {} - } -release_year_padding: 1 -expected: - - yvqtz2zvkzcbpj4jxrp7bvydfu - - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/files/simple_fuzzy_release_matcher/8.yaml b/tests/files/simple_fuzzy_release_matcher/8.yaml deleted file mode 100644 index b43e53a..0000000 --- a/tests/files/simple_fuzzy_release_matcher/8.yaml +++ /dev/null @@ -1,139 +0,0 @@ -about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu -input: > - { - "abstracts": [], - "refs": [], - "contribs": [ - { - "index": 0, - "raw_name": "Annelies Schulte Nordholt", - "role": "author", - "extra": { - "seq": "first" - } - } - ], - "publisher": "Uopen Journals", - "pages": "66", - "ext_ids": { - "doi": "10.18352/bmgn-lchr.128" - }, - "release_year": 2008, - "release_date": "2008-02-19", - "release_stage": "published", - "release_type": "article-journal", - "container_id": "sm7svbj64vc55gj4p23t7c3lrm", - "webcaptures": [], - "filesets": [], - "files": [ - { - "release_ids": [ - "yvqtz2zvkzcbpj4jxrp7bvydfu" - ], - "mimetype": "application/pdf", - "urls": [ - { - "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", - "rel": "publisher" - }, - { - "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", - "rel": "webarchive" - } - ], - "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23", - "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40", - "md5": "7dae3ec6c1d65cae6a91554071cc9625", - "size": 889420, - "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75", - "ident": "oew6z4a6gvfqxc5kiy2r62ucfq", - "state": "active" - } - ], - "container": { - "wikidata_qid": "Q15763709", - "issnp": "1873-5045", - "issne": "1873-5045", - "issnl": "1873-5045", - "publisher": "Uopen Journals", - "name": "Relief: Revue Électronique de Littérature Francaise", - "extra": { - "country": "nl", - "default_license": "CC-BY", - "doaj": { - "as_of": "2021-11-20", - "default_license": "CC-BY", - "seal": false - }, - "kbart": { - "clockss": { - "year_spans": [ - [ - 2007, - 2016 - ] - ] - }, - "lockss": { - "year_spans": [ - [ - 2007, - 2019 - ] - ] - }, - "pkp_pln": { - "year_spans": [ - [ - 2007, - 2021 - ] - ] - }, - "portico": { - "year_spans": [ - [ - 2007, - 2017 - ] - ] - } - }, - "languages": [ - "en" - ], - "publisher_type": "unipress", - "road": { - "as_of": "2018-01-24" - }, - "sherpa_romeo": { - "color": "blue" - }, - "szczepanski": { - "as_of": "2018" - }, - "urls": [ - "https://www.revue-relief.org/", - "http://www.revue-relief.org/index.php/relief", - "http://www.revue-relief.org/index.php/relief/about" - ] - }, - "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e", - "ident": "sm7svbj64vc55gj4p23t7c3lrm", - "state": "active" - }, - "work_id": "qcpd2i2txfdi5emqb7fxsawk6e", - "title": "Georges Perec: topographies parisiennes du flâneur", - "state": "active", - "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu", - "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff", - "extra": { - "crossref": { - "type": "journal-article" - } - } - } -release_year_padding: 1 -expected: - - yvqtz2zvkzcbpj4jxrp7bvydfu - - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/files/simple_fuzzy_release_matcher/9.yaml b/tests/files/simple_fuzzy_release_matcher/9.yaml deleted file mode 100644 index b43e53a..0000000 --- a/tests/files/simple_fuzzy_release_matcher/9.yaml +++ /dev/null @@ -1,139 +0,0 @@ -about: a full document, https://fatcat.wiki/release/yvqtz2zvkzcbpj4jxrp7bvydfu -input: > - { - "abstracts": [], - "refs": [], - "contribs": [ - { - "index": 0, - "raw_name": "Annelies Schulte Nordholt", - "role": "author", - "extra": { - "seq": "first" - } - } - ], - "publisher": "Uopen Journals", - "pages": "66", - "ext_ids": { - "doi": "10.18352/bmgn-lchr.128" - }, - "release_year": 2008, - "release_date": "2008-02-19", - "release_stage": "published", - "release_type": "article-journal", - "container_id": "sm7svbj64vc55gj4p23t7c3lrm", - "webcaptures": [], - "filesets": [], - "files": [ - { - "release_ids": [ - "yvqtz2zvkzcbpj4jxrp7bvydfu" - ], - "mimetype": "application/pdf", - "urls": [ - { - "url": "https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", - "rel": "publisher" - }, - { - "url": "https://web.archive.org/web/20200209043715/https://www.revue-relief.org/articles/10.18352/relief.128/galley/159/download/", - "rel": "webarchive" - } - ], - "sha256": "96f3552fa3eee10282109dd994f6993caf44627946317d03862a5df167140b23", - "sha1": "a9ba7c2038e2a77ac1b1144344443a3835d83c40", - "md5": "7dae3ec6c1d65cae6a91554071cc9625", - "size": 889420, - "revision": "57e3b801-0d84-405b-be8b-6b2b0583cd75", - "ident": "oew6z4a6gvfqxc5kiy2r62ucfq", - "state": "active" - } - ], - "container": { - "wikidata_qid": "Q15763709", - "issnp": "1873-5045", - "issne": "1873-5045", - "issnl": "1873-5045", - "publisher": "Uopen Journals", - "name": "Relief: Revue Électronique de Littérature Francaise", - "extra": { - "country": "nl", - "default_license": "CC-BY", - "doaj": { - "as_of": "2021-11-20", - "default_license": "CC-BY", - "seal": false - }, - "kbart": { - "clockss": { - "year_spans": [ - [ - 2007, - 2016 - ] - ] - }, - "lockss": { - "year_spans": [ - [ - 2007, - 2019 - ] - ] - }, - "pkp_pln": { - "year_spans": [ - [ - 2007, - 2021 - ] - ] - }, - "portico": { - "year_spans": [ - [ - 2007, - 2017 - ] - ] - } - }, - "languages": [ - "en" - ], - "publisher_type": "unipress", - "road": { - "as_of": "2018-01-24" - }, - "sherpa_romeo": { - "color": "blue" - }, - "szczepanski": { - "as_of": "2018" - }, - "urls": [ - "https://www.revue-relief.org/", - "http://www.revue-relief.org/index.php/relief", - "http://www.revue-relief.org/index.php/relief/about" - ] - }, - "revision": "2f36f957-7b60-4452-9310-1bd5e0035c0e", - "ident": "sm7svbj64vc55gj4p23t7c3lrm", - "state": "active" - }, - "work_id": "qcpd2i2txfdi5emqb7fxsawk6e", - "title": "Georges Perec: topographies parisiennes du flâneur", - "state": "active", - "ident": "yvqtz2zvkzcbpj4jxrp7bvydfu", - "revision": "c9e80d74-8c4f-47a7-b49a-689f26856dff", - "extra": { - "crossref": { - "type": "journal-article" - } - } - } -release_year_padding: 1 -expected: - - yvqtz2zvkzcbpj4jxrp7bvydfu - - lttg27o7mjganpkhrgy3xyv7vu diff --git a/tests/test_matching.py b/tests/test_matching.py index b9d7fae..eb54751 100644 --- a/tests/test_matching.py +++ b/tests/test_matching.py @@ -55,7 +55,7 @@ def test_simple_fuzzy_release_matcher(es_client, caplog): """ matcher = FuzzyReleaseMatcher(es=es_client) Case = collections.namedtuple("Case", "about input release_year_padding expected") - cases = yaml_to_cases(Case, "tests/files/simple_fuzzy_release_matcher/*.yaml") + cases = yaml_to_cases(Case, "tests/files/fuzzy_release_matcher/*.yaml") for i, c in enumerate(cases): matcher.release_year_padding = c.release_year_padding entity = entity_from_json(c.input, ReleaseEntity) -- cgit v1.2.3