From ba68fa4d91fbcd1dda3363b78bc24ca64ca2546b Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 2 Dec 2020 00:58:20 +0100 Subject: add case --- fuzzycat/common.py | 5 +++-- fuzzycat/verify.py | 7 +++++++ tests/data/release/he334wpbobegxhptpkvvrufioq | 24 ++++++++++++++++++++++++ tests/data/release/td3ouhgtzbbe7ctevfnldqkoba | 24 ++++++++++++++++++++++++ tests/data/verify.csv | 3 ++- 5 files changed, 60 insertions(+), 3 deletions(-) create mode 100644 tests/data/release/he334wpbobegxhptpkvvrufioq create mode 100644 tests/data/release/td3ouhgtzbbe7ctevfnldqkoba diff --git a/fuzzycat/common.py b/fuzzycat/common.py index 34508b7..5cef684 100644 --- a/fuzzycat/common.py +++ b/fuzzycat/common.py @@ -17,7 +17,10 @@ class OK(str, Enum): Reason for assuming we have a match. """ ARXIV_VERSION = 'ok.arxiv_version' + CUSTOM_BSI_UNDATED = 'ok.custom_bsi_undated' + CUSTOM_IEEE_ARXIV = 'ok.custom_ieee_arxiv' DATACITE_RELATED_ID = 'ok.datacite_related_id' + DATACITE_VERSION = 'ok.datacite_version' DOI = 'ok.doi' DUMMY = 'ok.dummy' FIGSHARE_VERSION = 'ok.figshare_version' @@ -25,8 +28,6 @@ class OK(str, Enum): SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match' TITLE_AUTHOR_MATCH = 'ok.title_author_match' TOKENIZED_AUTHORS = 'ok.tokenized_authors' - CUSTOM_IEEE_ARXIV = 'ok.custom_ieee_arxiv' - CUSTOM_BSI_UNDATED = 'ok.custom_bsi_undated' class Miss(str, Enum): diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index e688d49..5977f8e 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -159,6 +159,13 @@ def compare(a, b): if fragment in a_title_lower: return (Status.AMBIGUOUS, Miss.BLACKLISTED_FRAGMENT) + try: + if a_title and a_title == b_title and glom(a, "extra.datacite.metadataVersion") != glom( + b, "extra.datacite.metadataVersion"): + return (Status.EXACT, OK.DATACITE_VERSION) + except PathAccessError: + pass + try: a_doi = glom(a, "ext_ids.doi") b_doi = glom(b, "ext_ids.doi") diff --git a/tests/data/release/he334wpbobegxhptpkvvrufioq b/tests/data/release/he334wpbobegxhptpkvvrufioq new file mode 100644 index 0000000..2c72e00 --- /dev/null +++ b/tests/data/release/he334wpbobegxhptpkvvrufioq @@ -0,0 +1,24 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.7916/d88d0n1z" + }, + "extra": { + "datacite": { + "metadataVersion": 4 + }, + "release_month": 8 + }, + "ident": "he334wpbobegxhptpkvvrufioq", + "publisher": "Columbia University", + "refs": [], + "release_date": "2017-08-07", + "release_stage": "published", + "release_type": "article", + "release_year": 2017, + "revision": "143a4dcf-eab2-4293-b079-d32769e4c550", + "state": "active", + "title": "Eastern questionnaire, answer sheet for Interviewee 51207, page 048", + "work_id": "r5gbob7wuzexdarwje6fiyefsu" +} diff --git a/tests/data/release/td3ouhgtzbbe7ctevfnldqkoba b/tests/data/release/td3ouhgtzbbe7ctevfnldqkoba new file mode 100644 index 0000000..42bebbf --- /dev/null +++ b/tests/data/release/td3ouhgtzbbe7ctevfnldqkoba @@ -0,0 +1,24 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.7916/d8hx24f7" + }, + "extra": { + "datacite": { + "metadataVersion": 3 + }, + "release_month": 8 + }, + "ident": "td3ouhgtzbbe7ctevfnldqkoba", + "publisher": "Columbia University", + "refs": [], + "release_date": "2017-08-08", + "release_stage": "published", + "release_type": "article", + "release_year": 2017, + "revision": "8eab4949-e0e8-4339-9bb7-5d339e3d5639", + "state": "active", + "title": "Eastern questionnaire, answer sheet for Interviewee 51207, page 048", + "work_id": "ktmybzspenavxevth7artcbx7q" +} diff --git a/tests/data/verify.csv b/tests/data/verify.csv index 741cf15..874baa0 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -7,7 +7,7 @@ s46mfwvb4rdyhlforb6yxg3abi,5hvdhbszafhw5fbu4jnrmesdmu,Status.DIFFERENT,Miss.BOOK mn26hwbmqvh23jhsecoder3ixq,544v67u75fazfp5qssqzmh6fta,Status.DIFFERENT,Miss.YEAR 4srjsirjhvhvtenz23lg6bqnqu,3czbwace7bh4hkfehzntnddt2i,Status.STRONG,OK.ARXIV_VERSION vokr6qxyqrc55kyn45dyavr2lq,b5helm53ljdxjpxdnn5zjqpjve,Status.EXACT,OK.TITLE_AUTHOR_MATCH -kgeynply6vcxdeiluu6es6w72m,cm536ige6bfdfhhesp26ibfdva,Status.EXACT,OK.TITLE_AUTHOR_MATCH +kgeynply6vcxdeiluu6es6w72m,cm536ige6bfdfhhesp26ibfdva,Status.EXACT,OK.DATACITE_VERSION knwc764q25f33ib6qnwo7pyaui,n74tqiqi5jcx5d6vl5f7lpokaa,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY eo4qptzoqrholjslj7nemlne2y,zisq3tsezjcejinlpf7qgk6z2i,Status.DIFFERENT,Miss.YEAR crsd5c2fhvd7hodbd4trne3lgi,4547ybo5hvf4xhlh5triaccxai,Status.DIFFERENT,Miss.YEAR @@ -104,3 +104,4 @@ omjj75lv4rayvcqmgmicnzf5ye,xxfujnvafrazbjw7kvh7bhmuvy,, 63g4ukdxajcqhdytqla6du3t3u,rz72bzfevzeofdeb342c6z45qu,Status.DIFFERENT,Miss.CUSTOM_PREFIX_10_14288 ij3yuoh6lrh3tkrv5o7gfk6yyi,tur236mqljdfdnlzbbnks2sily,Status.STRONG,OK.CUSTOM_IEEE_ARXIV neznj5fb4nf3tdqnotnbe34b6e,gcqdvvjiq5bphl7lpc4invi4vy,Status.STRONG,OK.CUSTOM_BSI_UNDATED +he334wpbobegxhptpkvvrufioq,td3ouhgtzbbe7ctevfnldqkoba,Status.EXACT,OK.DATACITE_VERSION -- cgit v1.2.3