diff options
-rw-r--r-- | fuzzycat/verify.py | 13 | ||||
-rw-r--r-- | tests/data/release/nb4yakyqebalbatnnfijkfhmka | 43 | ||||
-rw-r--r-- | tests/data/release/pr7e4l5eibaavm3zsk62nmphni | 54 | ||||
-rw-r--r-- | tests/data/verify.csv | 9 | ||||
-rw-r--r-- | tests/test_verify.py | 9 |
5 files changed, 120 insertions, 8 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 0457e92..a223e48 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -138,6 +138,9 @@ class GroupVerifier: def compare(a, b): """ Compare two entities, return match status and reason. + + TODO: We might want a bunch of kwargs for things like year gap threshold + and the like. """ try: if glom(a, "ext_ids.doi") == glom(b, "ext_ids.doi"): @@ -215,7 +218,6 @@ def compare(a, b): if re.match(r"appendix ?[^ ]*$", a_title_lower): return (Status.AMBIGUOUS, Miss.APPENDIX) - try: # TODO: figshare versions, "xxx.v1" FIGSHARE_PREFIX = "10.6084/" @@ -314,6 +316,14 @@ def compare(a, b): a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ") b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ") + # https://fatcat.wiki/release/psykbwxylndtdaand2ymtkgzqu + # https://fatcat.wiki/release/xizkwvsodzajnn4u7lgeldqoum + if a_slug_title == b_slug_title: + a_year = a.get("release_year") + b_year = b.get("release_year") + if a_year and b_year and abs(a_year - b_year) > 40: + return (Status.DIFFERENT, Miss.YEAR) + if a_slug_title == b_slug_title: # via: https://fatcat.wiki/release/ij3yuoh6lrh3tkrv5o7gfk6yyi # https://fatcat.wiki/release/tur236mqljdfdnlzbbnks2sily @@ -446,7 +456,6 @@ def compare(a, b): except PathAccessError: pass - return (Status.AMBIGUOUS, OK.DUMMY) diff --git a/tests/data/release/nb4yakyqebalbatnnfijkfhmka b/tests/data/release/nb4yakyqebalbatnnfijkfhmka new file mode 100644 index 0000000..445fb13 --- /dev/null +++ b/tests/data/release/nb4yakyqebalbatnnfijkfhmka @@ -0,0 +1,43 @@ +{ + "abstracts": [], + "container_id": "tol7woxlqjeg5bmzadeg6qrg3e", + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_name": "Giacomo Rodano", + "role": "author" + }, + { + "index": 1, + "raw_name": "Nicolas Andre Benigno Serrano-Velarde", + "role": "author" + }, + { + "index": 2, + "raw_name": "Emanuele Tarantino", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.2139/ssrn.2645730" + }, + "extra": { + "crossref": { + "type": "journal-article" + } + }, + "ident": "nb4yakyqebalbatnnfijkfhmka", + "language": "en", + "publisher": "Elsevier BV", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 2015, + "revision": "6352e747-a5aa-487f-9a6e-b7a50dc5e2b4", + "state": "active", + "title": "Bankruptcy Law and Bank Financing", + "work_id": "ctu7mu3bongg7pbrf5n3cernde" +} diff --git a/tests/data/release/pr7e4l5eibaavm3zsk62nmphni b/tests/data/release/pr7e4l5eibaavm3zsk62nmphni new file mode 100644 index 0000000..c27d39c --- /dev/null +++ b/tests/data/release/pr7e4l5eibaavm3zsk62nmphni @@ -0,0 +1,54 @@ +{ + "abstracts": [], + "container_id": "exrvddewvnb4pffd64fjacpoq4", + "contribs": [ + { + "extra": { + "seq": "first" + }, + "index": 0, + "raw_name": "Giacomo Rodano", + "role": "author" + }, + { + "index": 1, + "raw_name": "Nicolas Serrano-Velarde", + "role": "author" + }, + { + "index": 2, + "raw_name": "Emanuele Tarantino", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1016/j.jfineco.2016.01.016" + }, + "extra": { + "crossref": { + "alternative-id": [ + "S0304405X16000210" + ], + "subject": [ + "Strategy and Management", + "Economics and Econometrics", + "Accounting", + "Finance" + ], + "type": "journal-article" + } + }, + "ident": "pr7e4l5eibaavm3zsk62nmphni", + "language": "en", + "pages": "363-382", + "publisher": "Elsevier BV", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 2016, + "revision": "2de55471-3503-4399-b790-2b2dc385c75f", + "state": "active", + "title": "Bankruptcy law and bank financing", + "volume": "120", + "work_id": "7uclti36yngurb374f4mcty2jy" +} diff --git a/tests/data/verify.csv b/tests/data/verify.csv index 72baebb..0f95905 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -8,7 +8,7 @@ mn26hwbmqvh23jhsecoder3ixq,544v67u75fazfp5qssqzmh6fta,Status.DIFFERENT,Miss.YEAR 4srjsirjhvhvtenz23lg6bqnqu,3czbwace7bh4hkfehzntnddt2i,Status.EXACT,OK.WORK_ID vokr6qxyqrc55kyn45dyavr2lq,b5helm53ljdxjpxdnn5zjqpjve,Status.EXACT,OK.WORK_ID kgeynply6vcxdeiluu6es6w72m,cm536ige6bfdfhhesp26ibfdva,Status.EXACT,OK.DATACITE_VERSION -knwc764q25f33ib6qnwo7pyaui,n74tqiqi5jcx5d6vl5f7lpokaa,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY +knwc764q25f33ib6qnwo7pyaui,n74tqiqi5jcx5d6vl5f7lpokaa,Status.DIFFERENT,Miss.YEAR eo4qptzoqrholjslj7nemlne2y,zisq3tsezjcejinlpf7qgk6z2i,Status.DIFFERENT,Miss.YEAR crsd5c2fhvd7hodbd4trne3lgi,4547ybo5hvf4xhlh5triaccxai,Status.DIFFERENT,Miss.YEAR egxon2iqljf47c4stvacnccvwy,swuxb5owx5g4hff3c7ur5x3awy,Status.DIFFERENT,Miss.YEAR @@ -39,7 +39,7 @@ fmeud4dykjfudb5kjr2fgmaneq,iid2bnrjjbegtpgmpuppjou4k4,Status.DIFFERENT,Miss.SUBT zmivcpjvhba25ldkx27d24oefa,mjapiqe2nzcy3fs3hriw253dye,Status.STRONG,OK.FIGSHARE_VERSION lynlkp7wh5hn3mlpzcfz4faoqi,yrbvjd4xrjaq3jxt7pkheysclm,Status.DIFFERENT,Miss.YEAR t3vpox5wrvbgtcigp6a6o64oey,q5yaj5zbzjctzapb5bztzctsoe,Status.DIFFERENT,Miss.YEAR -65qtai5dmjb2hmkwa73nwafyhu,p4lk4tbohjat3g5nn5pb3kjdyu,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY +65qtai5dmjb2hmkwa73nwafyhu,p4lk4tbohjat3g5nn5pb3kjdyu,Status.DIFFERENT,Miss.YEAR fqtc2tonfbh7hlcwoxgxzqi4lu,ng7utp7murge3ksuzbtljf5bsq,Status.DIFFERENT,Miss.YEAR mbnr3nrdijerto6wfjnlsmfhga,ddikrsxnajblvchthiwcbsmiue,Status.STRONG,OK.DATACITE_RELATED_ID nqfv37as6bcohketfrhiuac2mq,ty6megtz35c3hep57bbx2cetja,Status.DIFFERENT,Miss.YEAR @@ -61,7 +61,7 @@ muk4xhjhubc3xn6qqddllgfsly,2gywie7yqfflnl6tljfo36keqi,Status.EXACT,OK.WORK_ID iywyis7npngxxbco6fgjrclrzy,anhsfjxg3few5nkfsvheehiebq,Status.DIFFERENT,Miss.BOOK_CHAPTER rk7mn5uaqjaslgcxc2nl6ijpaq,td3rnxzbxzeslj6ijoce3mtxcq,Status.EXACT,OK.WORK_ID ohkfrjjcxfcavoqoqt52wi6eke,egufgu3yubgthex3y7fdt7uupa,Status.DIFFERENT,Miss.DATASET_DOI -dklwsz4w3rdlfddif4pcxb6ngm,wsbinmv7lragjnaedbgws6bztm,Status.AMBIGUOUS,OK.DUMMY +dklwsz4w3rdlfddif4pcxb6ngm,wsbinmv7lragjnaedbgws6bztm,Status.DIFFERENT,Miss.YEAR jizydliu2vclvpdtcrajlvuq2m,3g6mdd3tvjabdaez6mwcycso3q,Status.EXACT,OK.WORK_ID fvrscdvsznb4zlhuadd6ar7ot4,57la45yryjd73gav22bnl4lyni,Status.STRONG,OK.FIGSHARE_VERSION 6fedywjyynbxhdqv3etxjuqhba,gls2x7ca4nhzrkf437gdnj6ekq,Status.DIFFERENT,Miss.YEAR @@ -76,7 +76,7 @@ hqwrsqnzdjbqhbrqnsbooohqse,ydx2wolhvffxnb6as6gekmocx4,Status.EXACT,OK.WORK_ID vz7q453kr5ds3ptsldwxedbiii,2wzybzqlmjhjfh75cxjohbvzi4,Status.DIFFERENT,Miss.RELEASE_TYPE efumvvpw6jbb7ehp2qfdatgxzy,funn7cwjbrgefji27tzpl4avuu,Status.EXACT,OK.WORK_ID pjvosq3ulzeb5d6w7zijrbz75y,pxkm2asxjnflzkdi5qnfd5fpt4,Status.DIFFERENT,Miss.BOOK_CHAPTER -ji3qg5sajndt7p54u7wumqsjye,hxau2e34bnhhbeucfdrncgmcby,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY +ji3qg5sajndt7p54u7wumqsjye,hxau2e34bnhhbeucfdrncgmcby,Status.DIFFERENT,Miss.YEAR 2gpvznjjcfbmhats6ot2vsodju,qk6arua2snaobfvdvlfvjp3yeq,Status.AMBIGUOUS, 75ky5xniobchzbhzwhmwhu5uoa,uvgwfvwnnbg7xchy63bloyrwvi,Status.STRONG, jdtngtiz3bdqboypujoni2x3ry,byh7xr5qhjca3bw53ivdotck3e,Status.EXACT, @@ -115,3 +115,4 @@ s5hm65waingwjmgf3plu76hzu4,t6k5mec4xjdebcs3iv3uzs3yvu,Status.AMBIGUOUS, zlywxoy7cfexvaatziqp4ip5m4,phqelg6oc5hs5dehhgmodcnh5u,Status.EXACT,OK.DATACITE_VERSION vqjpcuqxnbhdtelzspxjmklm7u,knuzh5bcqbg7ph7ffvqaiwevti,Status.AMBIGUOUS,Miss.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW psykbwxylndtdaand2ymtkgzqu,xizkwvsodzajnn4u7lgeldqoum,Status.DIFFERENT,Miss.YEAR +nb4yakyqebalbatnnfijkfhmka,pr7e4l5eibaavm3zsk62nmphni,Status.STRONG, diff --git a/tests/test_verify.py b/tests/test_verify.py index 533a5ba..75b1190 100644 --- a/tests/test_verify.py +++ b/tests/test_verify.py @@ -46,8 +46,13 @@ def test_compare(): .format(a=a, b=b, base=FATCAT_BASE_URL, status=status, reason=reason)) continue assert status_mapping[ - expected_status] == status, "status: want {}, got {} {} for {} {}".format( - expected_status, status, reason, a, b) + expected_status] == status, "status: want {expected_status}, got {status} {reason} for {base}release/{a} {base}release/{b}".format( + expected_status=expected_status, + status=status, + reason=reason, + base=FATCAT_BASE_URL, + a=a, + b=b) if expected_reason: assert expected_reason.lower() == reason.lower( ), "reason [{base}release/{a} {base}release/{b}]: want {reason}, got {expected_reason}".format( |