aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fuzzycat/common.py1
-rw-r--r--fuzzycat/verify.py3
-rw-r--r--notes/2020_11_testruns.md13
-rw-r--r--tests/data/release/psykbwxylndtdaand2ymtkgzqu35
-rw-r--r--tests/data/release/xizkwvsodzajnn4u7lgeldqoum21
-rw-r--r--tests/data/verify.csv35
6 files changed, 83 insertions, 25 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py
index bbffcc9..f03b7cc 100644
--- a/fuzzycat/common.py
+++ b/fuzzycat/common.py
@@ -28,6 +28,7 @@ class OK(str, Enum):
SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match'
TITLE_AUTHOR_MATCH = 'ok.title_author_match'
TOKENIZED_AUTHORS = 'ok.tokenized_authors'
+ WORK_ID = 'ok.work_id'
class Miss(str, Enum):
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index b385c25..0457e92 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -145,6 +145,9 @@ def compare(a, b):
except PathAccessError:
pass
+ if a.get("work_id") and a.get("work_id") == b.get("work_id"):
+ return (Status.EXACT, OK.WORK_ID)
+
a_title = a.get("title", "")
a_title_lower = a_title.lower()
b_title = b.get("title", "")
diff --git a/notes/2020_11_testruns.md b/notes/2020_11_testruns.md
index 6e36bdc..1abf2da 100644
--- a/notes/2020_11_testruns.md
+++ b/notes/2020_11_testruns.md
@@ -90,16 +90,13 @@ Preliminary case distribution:
## Case Mining
-* [ ] https://fatcat.wiki/release/3jnis3ebrfgcdmdaa4aunc7xfi https://fatcat.wiki/release/wb3qvo27irfohmo3pa3aatpooa Status.AMBIGUOUS OK.DUMMY
+> "-" ignore, "x" done
-Too few hints.
-
-* [ ] https://fatcat.wiki/release/byrshkihwjfmplsv3ozbmpsz64 https://fatcat.wiki/release/fpll6q4ebvfgvonwi4vvetzjlq Status.AMBIGUOUS OK.DUMMY
-
-Too few hints.
-
-* [ ] https://fatcat.wiki/release/vqjpcuqxnbhdtelzspxjmklm7u https://fatcat.wiki/release/knuzh5bcqbg7ph7ffvqaiwevti Status.AMBIGUOUS OK.DUMMY
+* [-] https://fatcat.wiki/release/3jnis3ebrfgcdmdaa4aunc7xfi https://fatcat.wiki/release/wb3qvo27irfohmo3pa3aatpooa Status.AMBIGUOUS OK.DUMMY
+* [-] https://fatcat.wiki/release/byrshkihwjfmplsv3ozbmpsz64 https://fatcat.wiki/release/fpll6q4ebvfgvonwi4vvetzjlq Status.AMBIGUOUS OK.DUMMY
+* [x] https://fatcat.wiki/release/vqjpcuqxnbhdtelzspxjmklm7u https://fatcat.wiki/release/knuzh5bcqbg7ph7ffvqaiwevti Status.AMBIGUOUS OK.DUMMY
* [ ] https://fatcat.wiki/release/psykbwxylndtdaand2ymtkgzqu https://fatcat.wiki/release/xizkwvsodzajnn4u7lgeldqoum Status.AMBIGUOUS OK.DUMMY
+
* [ ] https://fatcat.wiki/release/in2mm2wafbczjgzlapq55rrksq https://fatcat.wiki/release/oaezupjwnfckxaajjhjb3fl42e Status.AMBIGUOUS OK.DUMMY
* [ ] https://fatcat.wiki/release/u4mjilmo75bcnjyms564l66jea https://fatcat.wiki/release/6ofr4mqnmrdy3nyyh5ufm5ats4 Status.AMBIGUOUS OK.DUMMY
* [ ] https://fatcat.wiki/release/2qcjbknhyrhh5dbuxobjy3gmqm https://fatcat.wiki/release/r6znetafszbuvaevbasn7ezsk4 Status.AMBIGUOUS OK.DUMMY
diff --git a/tests/data/release/psykbwxylndtdaand2ymtkgzqu b/tests/data/release/psykbwxylndtdaand2ymtkgzqu
new file mode 100644
index 0000000..5a91126
--- /dev/null
+++ b/tests/data/release/psykbwxylndtdaand2ymtkgzqu
@@ -0,0 +1,35 @@
+{
+ "abstracts": [],
+ "container_id": "nzsu5tiiqzdajk5fqisig3k6du",
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.5694/j.1326-5377.1916.tb117279.x"
+ },
+ "extra": {
+ "crossref": {
+ "alternative-id": [
+ "10.5694/j.1326-5377.1916.tb117279.x"
+ ],
+ "archive": [
+ "Portico"
+ ],
+ "subject": [
+ "General Medicine"
+ ],
+ "type": "journal-article"
+ }
+ },
+ "ident": "psykbwxylndtdaand2ymtkgzqu",
+ "issue": "11",
+ "language": "en",
+ "pages": "218-219",
+ "publisher": "AMPCo",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 1916,
+ "revision": "ff37e91c-b10b-46eb-968f-aabfc0bcfdc2",
+ "state": "active",
+ "title": "JAMES JAMIESON",
+ "work_id": "7xwfqtxbxbhxbjorxgatseqhju"
+}
diff --git a/tests/data/release/xizkwvsodzajnn4u7lgeldqoum b/tests/data/release/xizkwvsodzajnn4u7lgeldqoum
new file mode 100644
index 0000000..c4ec377
--- /dev/null
+++ b/tests/data/release/xizkwvsodzajnn4u7lgeldqoum
@@ -0,0 +1,21 @@
+{
+ "abstracts": [],
+ "contribs": [],
+ "ext_ids": {
+ "doi": "10.14288/1.0027724"
+ },
+ "extra": {
+ "datacite": {}
+ },
+ "ident": "xizkwvsodzajnn4u7lgeldqoum",
+ "language": "en",
+ "publisher": "The University of British Columbia",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article",
+ "release_year": 1967,
+ "revision": "9d050008-e82f-4474-9448-aca4ac910180",
+ "state": "active",
+ "title": "James Jamieson",
+ "work_id": "z3zqk35jifhvnc27lajpatnt4a"
+}
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index ae8ea80..72baebb 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -1,12 +1,12 @@
7kzrmoajzzedxgdvbltgqihszu,bd4crw4p7ber7pzhpoyw2c77bi,Status.STRONG,OK.DATACITE_RELATED_ID
foddwpevbjao3b3uwccvtuxfi4,versjalccvgdtp3q25elgy2z7a,Status.DIFFERENT,Miss.DATASET_DOI
-v2ypxs2yrbh57cdo6lfuiik64e,6zzx36tlefdtbftzpg4wtump3e,Status.STRONG,OK.ARXIV_VERSION
-hdvg6m467bhyng4l7xauk4ymoa,f5fugxp3qze2fht2uxt3xivi4i,Status.STRONG,OK.PREPRINT_PUBLISHED
+v2ypxs2yrbh57cdo6lfuiik64e,6zzx36tlefdtbftzpg4wtump3e,Status.EXACT,OK.WORK_ID
+hdvg6m467bhyng4l7xauk4ymoa,f5fugxp3qze2fht2uxt3xivi4i,Status.EXACT,OK.WORK_ID
cubz67ifbvacppya3i27yiwr2q,4ojllezvyfehnpnj2pil2h2pdu,Status.EXACT,OK.TITLE_AUTHOR_MATCH
s46mfwvb4rdyhlforb6yxg3abi,5hvdhbszafhw5fbu4jnrmesdmu,Status.DIFFERENT,Miss.BOOK_CHAPTER
mn26hwbmqvh23jhsecoder3ixq,544v67u75fazfp5qssqzmh6fta,Status.DIFFERENT,Miss.YEAR
-4srjsirjhvhvtenz23lg6bqnqu,3czbwace7bh4hkfehzntnddt2i,Status.STRONG,OK.ARXIV_VERSION
-vokr6qxyqrc55kyn45dyavr2lq,b5helm53ljdxjpxdnn5zjqpjve,Status.EXACT,OK.TITLE_AUTHOR_MATCH
+4srjsirjhvhvtenz23lg6bqnqu,3czbwace7bh4hkfehzntnddt2i,Status.EXACT,OK.WORK_ID
+vokr6qxyqrc55kyn45dyavr2lq,b5helm53ljdxjpxdnn5zjqpjve,Status.EXACT,OK.WORK_ID
kgeynply6vcxdeiluu6es6w72m,cm536ige6bfdfhhesp26ibfdva,Status.EXACT,OK.DATACITE_VERSION
knwc764q25f33ib6qnwo7pyaui,n74tqiqi5jcx5d6vl5f7lpokaa,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
eo4qptzoqrholjslj7nemlne2y,zisq3tsezjcejinlpf7qgk6z2i,Status.DIFFERENT,Miss.YEAR
@@ -17,15 +17,15 @@ c2pranaprjhrxk7x5euws32cg4,liarb7xuizewdafcubg2z3dwou,Status.DIFFERENT,Miss.CONT
tyokc7ccfjaw5nimkkl32dl6ta,gyyxomlfkzfannusvzoypbnel4,Status.AMBIGUOUS,Miss.BLACKLISTED
2wakwcyb2zhbla2aao3g6ajfli,dryvgf7v3jeergr3gendplglqq,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
zvwqju7e3zhf7jpbtoejfe3i4y,fpj5eqgiunfpjn7qkffwvpre5e,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
-cwfhdsdr6nbtngqwsqpafqj72u,icrvubkwprh6fl2irtrxziqqai,Status.STRONG,OK.ARXIV_VERSION
+cwfhdsdr6nbtngqwsqpafqj72u,icrvubkwprh6fl2irtrxziqqai,Status.EXACT,OK.WORK_ID
qlkjwemcrzcpjeeecduiunghui,chejpgnhebcx7of4d4dkuqhkne,Status.DIFFERENT,Miss.YEAR
no7a4vrfwnfp7jqrliq6n2hpxi,rscsor4cl5fydedr2jb6o7k4zi,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
mxfrtcc3njeh5dscwgzhrugzsq,x7lbkuc5afb75nz5l5kyrzy2ia,Status.DIFFERENT,Miss.YEAR
-cqkm3hyn3rgcng3d3alwtciwpq,unwrwze6znf5xouud35i3jlneq,Status.STRONG,OK.PREPRINT_PUBLISHED
+cqkm3hyn3rgcng3d3alwtciwpq,unwrwze6znf5xouud35i3jlneq,Status.EXACT,OK.WORK_ID
fzs6y277zbgxnbcsmmfnftyqgy,b2ggrb2mpvh4namvf6mht5nnaq,Status.DIFFERENT,Miss.YEAR
qgvu7i5eqrakpcnantqikaxpbu,kafrljfrv5favpvbgxavobh46y,Status.AMBIGUOUS,Miss.SHORT_TITLE
qbfao6tzh5gkxaqaqwmidpme3q,whyzodcvtzehjdvj5ezvbkda34,Status.DIFFERENT,Miss.SUBTITLE
-ml7eci5bmnc4zl6fc6vzscciwu,rsjv7rxzuzdptmfn7orwxr7n6q,Status.STRONG,OK.ARXIV_VERSION
+ml7eci5bmnc4zl6fc6vzscciwu,rsjv7rxzuzdptmfn7orwxr7n6q,Status.EXACT,OK.WORK_ID
3mup7xynsfdpne3rtp274lmwdy,pbhkek57zrddnllui7pl4vjhai,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
lr5emu7qpfdmve6jcfjlrgoi64,revp263aa5dnjft72qynhzjcvi,Status.DIFFERENT,Miss.YEAR
bvu4qrzfvfdhxpvl4k2ertxkbe,qnteujy54vflrnjtq2k4wtrabq,Status.DIFFERENT,Miss.YEAR
@@ -34,7 +34,7 @@ apbr2crzrfamhdqt35c3sgkld4,fwhmikkv7rcjdp6j6vmroggncy,Status.STRONG,OK.SLUG_TITL
3x5gxfal75geppn22rck3bdanm,fpjygddf7bgahaaabjl2d67m4i,Status.EXACT,OK.TITLE_AUTHOR_MATCH
mkqmxbrhozhxphemdgshl57m3u,ahlp3vywzzb5fh5tbjskaym3ri,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
2hquztvjlrai3frazkmb6icgzy,ygkmoig5fjhtbg3rcobuy67pnu,Status.AMBIGUOUS,Miss.SHORT_TITLE
-uzrpjthgpbb2hhacohndcgj3qm,gxbp2vmubnhgrhfobb7wceujvm,Status.STRONG,OK.ARXIV_VERSION
+uzrpjthgpbb2hhacohndcgj3qm,gxbp2vmubnhgrhfobb7wceujvm,Status.EXACT,OK.WORK_ID
fmeud4dykjfudb5kjr2fgmaneq,iid2bnrjjbegtpgmpuppjou4k4,Status.DIFFERENT,Miss.SUBTITLE
zmivcpjvhba25ldkx27d24oefa,mjapiqe2nzcy3fs3hriw253dye,Status.STRONG,OK.FIGSHARE_VERSION
lynlkp7wh5hn3mlpzcfz4faoqi,yrbvjd4xrjaq3jxt7pkheysclm,Status.DIFFERENT,Miss.YEAR
@@ -43,38 +43,38 @@ t3vpox5wrvbgtcigp6a6o64oey,q5yaj5zbzjctzapb5bztzctsoe,Status.DIFFERENT,Miss.YEAR
fqtc2tonfbh7hlcwoxgxzqi4lu,ng7utp7murge3ksuzbtljf5bsq,Status.DIFFERENT,Miss.YEAR
mbnr3nrdijerto6wfjnlsmfhga,ddikrsxnajblvchthiwcbsmiue,Status.STRONG,OK.DATACITE_RELATED_ID
nqfv37as6bcohketfrhiuac2mq,ty6megtz35c3hep57bbx2cetja,Status.DIFFERENT,Miss.YEAR
-cedhaxcvkrddpeedqtaxln4zsq,5hzpesjrjrdrzaoahvihorp7eq,Status.STRONG,OK.PREPRINT_PUBLISHED
+cedhaxcvkrddpeedqtaxln4zsq,5hzpesjrjrdrzaoahvihorp7eq,Status.EXACT,OK.WORK_ID
wwiarqhsgbevdc74f6i4qmvyhy,d35gplnuibe6djfhnh42o66zbm,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
arzle77ezbbz5e33ghpqlwjw5e,e6ism7bt2vf5jl4v2ffwy3gqvu,Status.DIFFERENT,Miss.SUBTITLE
-yv3ihfy6pfe4xblrj7dcf3674u,tmewuet24jg5dflspneju2cot4,Status.STRONG,OK.ARXIV_VERSION
+yv3ihfy6pfe4xblrj7dcf3674u,tmewuet24jg5dflspneju2cot4,Status.EXACT,OK.WORK_ID
rh3r3fncmzaulfdfrjzv25tpli,7zp3azvi4vbxxob2cdyzm6pepa,Status.EXACT,OK.TITLE_AUTHOR_MATCH
lf7w27ma2ncjjpwoy2kl22t77e,mgxkqlohmbhfpedxwg3s5jhrrq,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
libbt4mcwng3tiwcutfaxewmjy,6csob32ld5dx7h63cssqly6rfm,Status.DIFFERENT,Miss.YEAR
-2r6dem2qanfttn73lezeislize,4iksfoith5b6zjarfihdtosr3e,Status.STRONG,OK.ARXIV_VERSION
+2r6dem2qanfttn73lezeislize,4iksfoith5b6zjarfihdtosr3e,Status.EXACT,OK.WORK_ID
wif435fwunfpfd46vvxo3at5ya,fy3j2l4s55b7ffltpiaic2jj7i,Status.DIFFERENT,Miss.YEAR
huophilkpbh2ddemt7okzzkuyq,crle5axqrfhfdob464wlwhfrf4,Status.AMBIGUOUS,Miss.SHORT_TITLE
dcq2jgd5abbjflzun4n3v6gjh4,ptovjgczrvft5fq2plyldafniq,Status.DIFFERENT,Miss.YEAR
7ah6efvk2ncjzgywch2cmtfumq,nj7v4e3cxbfybozjmdiuwqo4sm,Status.DIFFERENT,Miss.RELEASE_TYPE
eu4xst6zx5atfj37mvwdm54opq,7b7vnb7bc5g5va4yk72ruajok4,Status.STRONG,OK.FIGSHARE_VERSION
6ovhnujfsff2nhnoeimjcckgta,qeujgsfrmvft7k7r474maekvua,Status.DIFFERENT,Miss.DATASET_DOI
-muk4xhjhubc3xn6qqddllgfsly,2gywie7yqfflnl6tljfo36keqi,Status.STRONG,OK.ARXIV_VERSION
+muk4xhjhubc3xn6qqddllgfsly,2gywie7yqfflnl6tljfo36keqi,Status.EXACT,OK.WORK_ID
iywyis7npngxxbco6fgjrclrzy,anhsfjxg3few5nkfsvheehiebq,Status.DIFFERENT,Miss.BOOK_CHAPTER
-rk7mn5uaqjaslgcxc2nl6ijpaq,td3rnxzbxzeslj6ijoce3mtxcq,Status.STRONG,OK.ARXIV_VERSION
+rk7mn5uaqjaslgcxc2nl6ijpaq,td3rnxzbxzeslj6ijoce3mtxcq,Status.EXACT,OK.WORK_ID
ohkfrjjcxfcavoqoqt52wi6eke,egufgu3yubgthex3y7fdt7uupa,Status.DIFFERENT,Miss.DATASET_DOI
dklwsz4w3rdlfddif4pcxb6ngm,wsbinmv7lragjnaedbgws6bztm,Status.AMBIGUOUS,OK.DUMMY
-jizydliu2vclvpdtcrajlvuq2m,3g6mdd3tvjabdaez6mwcycso3q,Status.STRONG,OK.PREPRINT_PUBLISHED
+jizydliu2vclvpdtcrajlvuq2m,3g6mdd3tvjabdaez6mwcycso3q,Status.EXACT,OK.WORK_ID
fvrscdvsznb4zlhuadd6ar7ot4,57la45yryjd73gav22bnl4lyni,Status.STRONG,OK.FIGSHARE_VERSION
6fedywjyynbxhdqv3etxjuqhba,gls2x7ca4nhzrkf437gdnj6ekq,Status.DIFFERENT,Miss.YEAR
7lepq6lyyfepdjat6ohpeqycdu,cfm6qhhxovferl2fahf6jmcsiu,Status.DIFFERENT,Miss.YEAR
-ijbm7t2mpjcrrjazrmeli6b42a,7ijg4ar62rgo3olfbxltltrzc4,Status.EXACT,OK.TITLE_AUTHOR_MATCH
+ijbm7t2mpjcrrjazrmeli6b42a,7ijg4ar62rgo3olfbxltltrzc4,Status.EXACT,OK.WORK_ID
hyt2ebpmhjg53f5eu4v5zortfm,ceu2t7fapvg43bvyyqck344pei,Status.DIFFERENT,Miss.SUBTITLE
uhih3c4gbzdtnciiqlfjx3w6le,lgga6cjz6bgo7cszpjfhpuoaqi,Status.DIFFERENT,Miss.YEAR
53w5pycrmvgglludwsv44m3czu,mvdjwqdvxfh3vd3zotf3gljm4a,Status.DIFFERENT,Miss.SUBTITLE
6vejogvunnbb7etjzu4yfs32mm,g53ggmce2rek5lw2l52oaimgiq,Status.DIFFERENT,Miss.SUBTITLE
325je3kjkjeerkchimvz6qxyji,ir7i7ldr7ffuvigvv6cvyyc7ju,Status.DIFFERENT,Miss.BOOK_CHAPTER
-hqwrsqnzdjbqhbrqnsbooohqse,ydx2wolhvffxnb6as6gekmocx4,Status.STRONG,OK.ARXIV_VERSION
+hqwrsqnzdjbqhbrqnsbooohqse,ydx2wolhvffxnb6as6gekmocx4,Status.EXACT,OK.WORK_ID
vz7q453kr5ds3ptsldwxedbiii,2wzybzqlmjhjfh75cxjohbvzi4,Status.DIFFERENT,Miss.RELEASE_TYPE
-efumvvpw6jbb7ehp2qfdatgxzy,funn7cwjbrgefji27tzpl4avuu,Status.STRONG,OK.ARXIV_VERSION
+efumvvpw6jbb7ehp2qfdatgxzy,funn7cwjbrgefji27tzpl4avuu,Status.EXACT,OK.WORK_ID
pjvosq3ulzeb5d6w7zijrbz75y,pxkm2asxjnflzkdi5qnfd5fpt4,Status.DIFFERENT,Miss.BOOK_CHAPTER
ji3qg5sajndt7p54u7wumqsjye,hxau2e34bnhhbeucfdrncgmcby,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
2gpvznjjcfbmhats6ot2vsodju,qk6arua2snaobfvdvlfvjp3yeq,Status.AMBIGUOUS,
@@ -114,3 +114,4 @@ iwtrxnov2repzlgoi2at2md6tm,t6k5mec4xjdebcs3iv3uzs3yvu,Status.AMBIGUOUS,
s5hm65waingwjmgf3plu76hzu4,t6k5mec4xjdebcs3iv3uzs3yvu,Status.AMBIGUOUS,
zlywxoy7cfexvaatziqp4ip5m4,phqelg6oc5hs5dehhgmodcnh5u,Status.EXACT,OK.DATACITE_VERSION
vqjpcuqxnbhdtelzspxjmklm7u,knuzh5bcqbg7ph7ffvqaiwevti,Status.AMBIGUOUS,Miss.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW
+psykbwxylndtdaand2ymtkgzqu,xizkwvsodzajnn4u7lgeldqoum,Status.DIFFERENT,Miss.YEAR