From 1b0309499ee59384eba074709a3440019141a31e Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 4 Dec 2020 00:52:01 +0100 Subject: update cases; ok.work_id --- fuzzycat/common.py | 1 + fuzzycat/verify.py | 3 +++ notes/2020_11_testruns.md | 13 ++++------ tests/data/release/psykbwxylndtdaand2ymtkgzqu | 35 +++++++++++++++++++++++++++ tests/data/release/xizkwvsodzajnn4u7lgeldqoum | 21 ++++++++++++++++ tests/data/verify.csv | 35 ++++++++++++++------------- 6 files changed, 83 insertions(+), 25 deletions(-) create mode 100644 tests/data/release/psykbwxylndtdaand2ymtkgzqu create mode 100644 tests/data/release/xizkwvsodzajnn4u7lgeldqoum diff --git a/fuzzycat/common.py b/fuzzycat/common.py index bbffcc9..f03b7cc 100644 --- a/fuzzycat/common.py +++ b/fuzzycat/common.py @@ -28,6 +28,7 @@ class OK(str, Enum): SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match' TITLE_AUTHOR_MATCH = 'ok.title_author_match' TOKENIZED_AUTHORS = 'ok.tokenized_authors' + WORK_ID = 'ok.work_id' class Miss(str, Enum): diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index b385c25..0457e92 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -145,6 +145,9 @@ def compare(a, b): except PathAccessError: pass + if a.get("work_id") and a.get("work_id") == b.get("work_id"): + return (Status.EXACT, OK.WORK_ID) + a_title = a.get("title", "") a_title_lower = a_title.lower() b_title = b.get("title", "") diff --git a/notes/2020_11_testruns.md b/notes/2020_11_testruns.md index 6e36bdc..1abf2da 100644 --- a/notes/2020_11_testruns.md +++ b/notes/2020_11_testruns.md @@ -90,16 +90,13 @@ Preliminary case distribution: ## Case Mining -* [ ] https://fatcat.wiki/release/3jnis3ebrfgcdmdaa4aunc7xfi https://fatcat.wiki/release/wb3qvo27irfohmo3pa3aatpooa Status.AMBIGUOUS OK.DUMMY +> "-" ignore, "x" done -Too few hints. - -* [ ] https://fatcat.wiki/release/byrshkihwjfmplsv3ozbmpsz64 https://fatcat.wiki/release/fpll6q4ebvfgvonwi4vvetzjlq Status.AMBIGUOUS OK.DUMMY - -Too few hints. - -* [ ] https://fatcat.wiki/release/vqjpcuqxnbhdtelzspxjmklm7u https://fatcat.wiki/release/knuzh5bcqbg7ph7ffvqaiwevti Status.AMBIGUOUS OK.DUMMY +* [-] https://fatcat.wiki/release/3jnis3ebrfgcdmdaa4aunc7xfi https://fatcat.wiki/release/wb3qvo27irfohmo3pa3aatpooa Status.AMBIGUOUS OK.DUMMY +* [-] https://fatcat.wiki/release/byrshkihwjfmplsv3ozbmpsz64 https://fatcat.wiki/release/fpll6q4ebvfgvonwi4vvetzjlq Status.AMBIGUOUS OK.DUMMY +* [x] https://fatcat.wiki/release/vqjpcuqxnbhdtelzspxjmklm7u https://fatcat.wiki/release/knuzh5bcqbg7ph7ffvqaiwevti Status.AMBIGUOUS OK.DUMMY * [ ] https://fatcat.wiki/release/psykbwxylndtdaand2ymtkgzqu https://fatcat.wiki/release/xizkwvsodzajnn4u7lgeldqoum Status.AMBIGUOUS OK.DUMMY + * [ ] https://fatcat.wiki/release/in2mm2wafbczjgzlapq55rrksq https://fatcat.wiki/release/oaezupjwnfckxaajjhjb3fl42e Status.AMBIGUOUS OK.DUMMY * [ ] https://fatcat.wiki/release/u4mjilmo75bcnjyms564l66jea https://fatcat.wiki/release/6ofr4mqnmrdy3nyyh5ufm5ats4 Status.AMBIGUOUS OK.DUMMY * [ ] https://fatcat.wiki/release/2qcjbknhyrhh5dbuxobjy3gmqm https://fatcat.wiki/release/r6znetafszbuvaevbasn7ezsk4 Status.AMBIGUOUS OK.DUMMY diff --git a/tests/data/release/psykbwxylndtdaand2ymtkgzqu b/tests/data/release/psykbwxylndtdaand2ymtkgzqu new file mode 100644 index 0000000..5a91126 --- /dev/null +++ b/tests/data/release/psykbwxylndtdaand2ymtkgzqu @@ -0,0 +1,35 @@ +{ + "abstracts": [], + "container_id": "nzsu5tiiqzdajk5fqisig3k6du", + "contribs": [], + "ext_ids": { + "doi": "10.5694/j.1326-5377.1916.tb117279.x" + }, + "extra": { + "crossref": { + "alternative-id": [ + "10.5694/j.1326-5377.1916.tb117279.x" + ], + "archive": [ + "Portico" + ], + "subject": [ + "General Medicine" + ], + "type": "journal-article" + } + }, + "ident": "psykbwxylndtdaand2ymtkgzqu", + "issue": "11", + "language": "en", + "pages": "218-219", + "publisher": "AMPCo", + "refs": [], + "release_stage": "published", + "release_type": "article-journal", + "release_year": 1916, + "revision": "ff37e91c-b10b-46eb-968f-aabfc0bcfdc2", + "state": "active", + "title": "JAMES JAMIESON", + "work_id": "7xwfqtxbxbhxbjorxgatseqhju" +} diff --git a/tests/data/release/xizkwvsodzajnn4u7lgeldqoum b/tests/data/release/xizkwvsodzajnn4u7lgeldqoum new file mode 100644 index 0000000..c4ec377 --- /dev/null +++ b/tests/data/release/xizkwvsodzajnn4u7lgeldqoum @@ -0,0 +1,21 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.14288/1.0027724" + }, + "extra": { + "datacite": {} + }, + "ident": "xizkwvsodzajnn4u7lgeldqoum", + "language": "en", + "publisher": "The University of British Columbia", + "refs": [], + "release_stage": "published", + "release_type": "article", + "release_year": 1967, + "revision": "9d050008-e82f-4474-9448-aca4ac910180", + "state": "active", + "title": "James Jamieson", + "work_id": "z3zqk35jifhvnc27lajpatnt4a" +} diff --git a/tests/data/verify.csv b/tests/data/verify.csv index ae8ea80..72baebb 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -1,12 +1,12 @@ 7kzrmoajzzedxgdvbltgqihszu,bd4crw4p7ber7pzhpoyw2c77bi,Status.STRONG,OK.DATACITE_RELATED_ID foddwpevbjao3b3uwccvtuxfi4,versjalccvgdtp3q25elgy2z7a,Status.DIFFERENT,Miss.DATASET_DOI -v2ypxs2yrbh57cdo6lfuiik64e,6zzx36tlefdtbftzpg4wtump3e,Status.STRONG,OK.ARXIV_VERSION -hdvg6m467bhyng4l7xauk4ymoa,f5fugxp3qze2fht2uxt3xivi4i,Status.STRONG,OK.PREPRINT_PUBLISHED +v2ypxs2yrbh57cdo6lfuiik64e,6zzx36tlefdtbftzpg4wtump3e,Status.EXACT,OK.WORK_ID +hdvg6m467bhyng4l7xauk4ymoa,f5fugxp3qze2fht2uxt3xivi4i,Status.EXACT,OK.WORK_ID cubz67ifbvacppya3i27yiwr2q,4ojllezvyfehnpnj2pil2h2pdu,Status.EXACT,OK.TITLE_AUTHOR_MATCH s46mfwvb4rdyhlforb6yxg3abi,5hvdhbszafhw5fbu4jnrmesdmu,Status.DIFFERENT,Miss.BOOK_CHAPTER mn26hwbmqvh23jhsecoder3ixq,544v67u75fazfp5qssqzmh6fta,Status.DIFFERENT,Miss.YEAR -4srjsirjhvhvtenz23lg6bqnqu,3czbwace7bh4hkfehzntnddt2i,Status.STRONG,OK.ARXIV_VERSION -vokr6qxyqrc55kyn45dyavr2lq,b5helm53ljdxjpxdnn5zjqpjve,Status.EXACT,OK.TITLE_AUTHOR_MATCH +4srjsirjhvhvtenz23lg6bqnqu,3czbwace7bh4hkfehzntnddt2i,Status.EXACT,OK.WORK_ID +vokr6qxyqrc55kyn45dyavr2lq,b5helm53ljdxjpxdnn5zjqpjve,Status.EXACT,OK.WORK_ID kgeynply6vcxdeiluu6es6w72m,cm536ige6bfdfhhesp26ibfdva,Status.EXACT,OK.DATACITE_VERSION knwc764q25f33ib6qnwo7pyaui,n74tqiqi5jcx5d6vl5f7lpokaa,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY eo4qptzoqrholjslj7nemlne2y,zisq3tsezjcejinlpf7qgk6z2i,Status.DIFFERENT,Miss.YEAR @@ -17,15 +17,15 @@ c2pranaprjhrxk7x5euws32cg4,liarb7xuizewdafcubg2z3dwou,Status.DIFFERENT,Miss.CONT tyokc7ccfjaw5nimkkl32dl6ta,gyyxomlfkzfannusvzoypbnel4,Status.AMBIGUOUS,Miss.BLACKLISTED 2wakwcyb2zhbla2aao3g6ajfli,dryvgf7v3jeergr3gendplglqq,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY zvwqju7e3zhf7jpbtoejfe3i4y,fpj5eqgiunfpjn7qkffwvpre5e,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY -cwfhdsdr6nbtngqwsqpafqj72u,icrvubkwprh6fl2irtrxziqqai,Status.STRONG,OK.ARXIV_VERSION +cwfhdsdr6nbtngqwsqpafqj72u,icrvubkwprh6fl2irtrxziqqai,Status.EXACT,OK.WORK_ID qlkjwemcrzcpjeeecduiunghui,chejpgnhebcx7of4d4dkuqhkne,Status.DIFFERENT,Miss.YEAR no7a4vrfwnfp7jqrliq6n2hpxi,rscsor4cl5fydedr2jb6o7k4zi,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY mxfrtcc3njeh5dscwgzhrugzsq,x7lbkuc5afb75nz5l5kyrzy2ia,Status.DIFFERENT,Miss.YEAR -cqkm3hyn3rgcng3d3alwtciwpq,unwrwze6znf5xouud35i3jlneq,Status.STRONG,OK.PREPRINT_PUBLISHED +cqkm3hyn3rgcng3d3alwtciwpq,unwrwze6znf5xouud35i3jlneq,Status.EXACT,OK.WORK_ID fzs6y277zbgxnbcsmmfnftyqgy,b2ggrb2mpvh4namvf6mht5nnaq,Status.DIFFERENT,Miss.YEAR qgvu7i5eqrakpcnantqikaxpbu,kafrljfrv5favpvbgxavobh46y,Status.AMBIGUOUS,Miss.SHORT_TITLE qbfao6tzh5gkxaqaqwmidpme3q,whyzodcvtzehjdvj5ezvbkda34,Status.DIFFERENT,Miss.SUBTITLE -ml7eci5bmnc4zl6fc6vzscciwu,rsjv7rxzuzdptmfn7orwxr7n6q,Status.STRONG,OK.ARXIV_VERSION +ml7eci5bmnc4zl6fc6vzscciwu,rsjv7rxzuzdptmfn7orwxr7n6q,Status.EXACT,OK.WORK_ID 3mup7xynsfdpne3rtp274lmwdy,pbhkek57zrddnllui7pl4vjhai,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY lr5emu7qpfdmve6jcfjlrgoi64,revp263aa5dnjft72qynhzjcvi,Status.DIFFERENT,Miss.YEAR bvu4qrzfvfdhxpvl4k2ertxkbe,qnteujy54vflrnjtq2k4wtrabq,Status.DIFFERENT,Miss.YEAR @@ -34,7 +34,7 @@ apbr2crzrfamhdqt35c3sgkld4,fwhmikkv7rcjdp6j6vmroggncy,Status.STRONG,OK.SLUG_TITL 3x5gxfal75geppn22rck3bdanm,fpjygddf7bgahaaabjl2d67m4i,Status.EXACT,OK.TITLE_AUTHOR_MATCH mkqmxbrhozhxphemdgshl57m3u,ahlp3vywzzb5fh5tbjskaym3ri,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY 2hquztvjlrai3frazkmb6icgzy,ygkmoig5fjhtbg3rcobuy67pnu,Status.AMBIGUOUS,Miss.SHORT_TITLE -uzrpjthgpbb2hhacohndcgj3qm,gxbp2vmubnhgrhfobb7wceujvm,Status.STRONG,OK.ARXIV_VERSION +uzrpjthgpbb2hhacohndcgj3qm,gxbp2vmubnhgrhfobb7wceujvm,Status.EXACT,OK.WORK_ID fmeud4dykjfudb5kjr2fgmaneq,iid2bnrjjbegtpgmpuppjou4k4,Status.DIFFERENT,Miss.SUBTITLE zmivcpjvhba25ldkx27d24oefa,mjapiqe2nzcy3fs3hriw253dye,Status.STRONG,OK.FIGSHARE_VERSION lynlkp7wh5hn3mlpzcfz4faoqi,yrbvjd4xrjaq3jxt7pkheysclm,Status.DIFFERENT,Miss.YEAR @@ -43,38 +43,38 @@ t3vpox5wrvbgtcigp6a6o64oey,q5yaj5zbzjctzapb5bztzctsoe,Status.DIFFERENT,Miss.YEAR fqtc2tonfbh7hlcwoxgxzqi4lu,ng7utp7murge3ksuzbtljf5bsq,Status.DIFFERENT,Miss.YEAR mbnr3nrdijerto6wfjnlsmfhga,ddikrsxnajblvchthiwcbsmiue,Status.STRONG,OK.DATACITE_RELATED_ID nqfv37as6bcohketfrhiuac2mq,ty6megtz35c3hep57bbx2cetja,Status.DIFFERENT,Miss.YEAR -cedhaxcvkrddpeedqtaxln4zsq,5hzpesjrjrdrzaoahvihorp7eq,Status.STRONG,OK.PREPRINT_PUBLISHED +cedhaxcvkrddpeedqtaxln4zsq,5hzpesjrjrdrzaoahvihorp7eq,Status.EXACT,OK.WORK_ID wwiarqhsgbevdc74f6i4qmvyhy,d35gplnuibe6djfhnh42o66zbm,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY arzle77ezbbz5e33ghpqlwjw5e,e6ism7bt2vf5jl4v2ffwy3gqvu,Status.DIFFERENT,Miss.SUBTITLE -yv3ihfy6pfe4xblrj7dcf3674u,tmewuet24jg5dflspneju2cot4,Status.STRONG,OK.ARXIV_VERSION +yv3ihfy6pfe4xblrj7dcf3674u,tmewuet24jg5dflspneju2cot4,Status.EXACT,OK.WORK_ID rh3r3fncmzaulfdfrjzv25tpli,7zp3azvi4vbxxob2cdyzm6pepa,Status.EXACT,OK.TITLE_AUTHOR_MATCH lf7w27ma2ncjjpwoy2kl22t77e,mgxkqlohmbhfpedxwg3s5jhrrq,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY libbt4mcwng3tiwcutfaxewmjy,6csob32ld5dx7h63cssqly6rfm,Status.DIFFERENT,Miss.YEAR -2r6dem2qanfttn73lezeislize,4iksfoith5b6zjarfihdtosr3e,Status.STRONG,OK.ARXIV_VERSION +2r6dem2qanfttn73lezeislize,4iksfoith5b6zjarfihdtosr3e,Status.EXACT,OK.WORK_ID wif435fwunfpfd46vvxo3at5ya,fy3j2l4s55b7ffltpiaic2jj7i,Status.DIFFERENT,Miss.YEAR huophilkpbh2ddemt7okzzkuyq,crle5axqrfhfdob464wlwhfrf4,Status.AMBIGUOUS,Miss.SHORT_TITLE dcq2jgd5abbjflzun4n3v6gjh4,ptovjgczrvft5fq2plyldafniq,Status.DIFFERENT,Miss.YEAR 7ah6efvk2ncjzgywch2cmtfumq,nj7v4e3cxbfybozjmdiuwqo4sm,Status.DIFFERENT,Miss.RELEASE_TYPE eu4xst6zx5atfj37mvwdm54opq,7b7vnb7bc5g5va4yk72ruajok4,Status.STRONG,OK.FIGSHARE_VERSION 6ovhnujfsff2nhnoeimjcckgta,qeujgsfrmvft7k7r474maekvua,Status.DIFFERENT,Miss.DATASET_DOI -muk4xhjhubc3xn6qqddllgfsly,2gywie7yqfflnl6tljfo36keqi,Status.STRONG,OK.ARXIV_VERSION +muk4xhjhubc3xn6qqddllgfsly,2gywie7yqfflnl6tljfo36keqi,Status.EXACT,OK.WORK_ID iywyis7npngxxbco6fgjrclrzy,anhsfjxg3few5nkfsvheehiebq,Status.DIFFERENT,Miss.BOOK_CHAPTER -rk7mn5uaqjaslgcxc2nl6ijpaq,td3rnxzbxzeslj6ijoce3mtxcq,Status.STRONG,OK.ARXIV_VERSION +rk7mn5uaqjaslgcxc2nl6ijpaq,td3rnxzbxzeslj6ijoce3mtxcq,Status.EXACT,OK.WORK_ID ohkfrjjcxfcavoqoqt52wi6eke,egufgu3yubgthex3y7fdt7uupa,Status.DIFFERENT,Miss.DATASET_DOI dklwsz4w3rdlfddif4pcxb6ngm,wsbinmv7lragjnaedbgws6bztm,Status.AMBIGUOUS,OK.DUMMY -jizydliu2vclvpdtcrajlvuq2m,3g6mdd3tvjabdaez6mwcycso3q,Status.STRONG,OK.PREPRINT_PUBLISHED +jizydliu2vclvpdtcrajlvuq2m,3g6mdd3tvjabdaez6mwcycso3q,Status.EXACT,OK.WORK_ID fvrscdvsznb4zlhuadd6ar7ot4,57la45yryjd73gav22bnl4lyni,Status.STRONG,OK.FIGSHARE_VERSION 6fedywjyynbxhdqv3etxjuqhba,gls2x7ca4nhzrkf437gdnj6ekq,Status.DIFFERENT,Miss.YEAR 7lepq6lyyfepdjat6ohpeqycdu,cfm6qhhxovferl2fahf6jmcsiu,Status.DIFFERENT,Miss.YEAR -ijbm7t2mpjcrrjazrmeli6b42a,7ijg4ar62rgo3olfbxltltrzc4,Status.EXACT,OK.TITLE_AUTHOR_MATCH +ijbm7t2mpjcrrjazrmeli6b42a,7ijg4ar62rgo3olfbxltltrzc4,Status.EXACT,OK.WORK_ID hyt2ebpmhjg53f5eu4v5zortfm,ceu2t7fapvg43bvyyqck344pei,Status.DIFFERENT,Miss.SUBTITLE uhih3c4gbzdtnciiqlfjx3w6le,lgga6cjz6bgo7cszpjfhpuoaqi,Status.DIFFERENT,Miss.YEAR 53w5pycrmvgglludwsv44m3czu,mvdjwqdvxfh3vd3zotf3gljm4a,Status.DIFFERENT,Miss.SUBTITLE 6vejogvunnbb7etjzu4yfs32mm,g53ggmce2rek5lw2l52oaimgiq,Status.DIFFERENT,Miss.SUBTITLE 325je3kjkjeerkchimvz6qxyji,ir7i7ldr7ffuvigvv6cvyyc7ju,Status.DIFFERENT,Miss.BOOK_CHAPTER -hqwrsqnzdjbqhbrqnsbooohqse,ydx2wolhvffxnb6as6gekmocx4,Status.STRONG,OK.ARXIV_VERSION +hqwrsqnzdjbqhbrqnsbooohqse,ydx2wolhvffxnb6as6gekmocx4,Status.EXACT,OK.WORK_ID vz7q453kr5ds3ptsldwxedbiii,2wzybzqlmjhjfh75cxjohbvzi4,Status.DIFFERENT,Miss.RELEASE_TYPE -efumvvpw6jbb7ehp2qfdatgxzy,funn7cwjbrgefji27tzpl4avuu,Status.STRONG,OK.ARXIV_VERSION +efumvvpw6jbb7ehp2qfdatgxzy,funn7cwjbrgefji27tzpl4avuu,Status.EXACT,OK.WORK_ID pjvosq3ulzeb5d6w7zijrbz75y,pxkm2asxjnflzkdi5qnfd5fpt4,Status.DIFFERENT,Miss.BOOK_CHAPTER ji3qg5sajndt7p54u7wumqsjye,hxau2e34bnhhbeucfdrncgmcby,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY 2gpvznjjcfbmhats6ot2vsodju,qk6arua2snaobfvdvlfvjp3yeq,Status.AMBIGUOUS, @@ -114,3 +114,4 @@ iwtrxnov2repzlgoi2at2md6tm,t6k5mec4xjdebcs3iv3uzs3yvu,Status.AMBIGUOUS, s5hm65waingwjmgf3plu76hzu4,t6k5mec4xjdebcs3iv3uzs3yvu,Status.AMBIGUOUS, zlywxoy7cfexvaatziqp4ip5m4,phqelg6oc5hs5dehhgmodcnh5u,Status.EXACT,OK.DATACITE_VERSION vqjpcuqxnbhdtelzspxjmklm7u,knuzh5bcqbg7ph7ffvqaiwevti,Status.AMBIGUOUS,Miss.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW +psykbwxylndtdaand2ymtkgzqu,xizkwvsodzajnn4u7lgeldqoum,Status.DIFFERENT,Miss.YEAR -- cgit v1.2.3