about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- fuzzycat/verify.py 13
-rw-r--r-- tests/data/release/nb4yakyqebalbatnnfijkfhmka 43
-rw-r--r-- tests/data/release/pr7e4l5eibaavm3zsk62nmphni 54
-rw-r--r-- tests/data/verify.csv 9
-rw-r--r-- tests/test_verify.py 9
5 files changed, 120 insertions, 8 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 0457e92..a223e48 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -138,6 +138,9 @@ class GroupVerifier:
def compare(a, b):
"""
Compare two entities, return match status and reason.
+
+ TODO: Consider accepting keyword arguments for tunable parameters, such as
+ the year gap threshold.
"""
try:
if glom(a, "ext_ids.doi") == glom(b, "ext_ids.doi"):
@@ -215,7 +218,6 @@ def compare(a, b):
if re.match(r"appendix ?[^ ]*$", a_title_lower):
return (Status.AMBIGUOUS, Miss.APPENDIX)
-
try:
# TODO: figshare versions, "xxx.v1"
FIGSHARE_PREFIX = "10.6084/"
@@ -314,6 +316,14 @@ def compare(a, b):
a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ")
b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ")
+ # Same title, years over 40 apart: https://fatcat.wiki/release/psykbwxylndtdaand2ymtkgzqu
+ # and https://fatcat.wiki/release/xizkwvsodzajnn4u7lgeldqoum
+ if a_slug_title == b_slug_title:
+ a_year = a.get("release_year")
+ b_year = b.get("release_year")
+ if a_year and b_year and abs(a_year - b_year) > 40:
+ return (Status.DIFFERENT, Miss.YEAR)
+
if a_slug_title == b_slug_title:
# via: https://fatcat.wiki/release/ij3yuoh6lrh3tkrv5o7gfk6yyi
# https://fatcat.wiki/release/tur236mqljdfdnlzbbnks2sily
@@ -446,7 +456,6 @@ def compare(a, b):
except PathAccessError:
pass
-
return (Status.AMBIGUOUS, OK.DUMMY)
diff --git a/tests/data/release/nb4yakyqebalbatnnfijkfhmka b/tests/data/release/nb4yakyqebalbatnnfijkfhmka
new file mode 100644
index 0000000..445fb13
--- /dev/null
+++ b/tests/data/release/nb4yakyqebalbatnnfijkfhmka
@@ -0,0 +1,43 @@
+{
+ "abstracts": [],
+ "container_id": "tol7woxlqjeg5bmzadeg6qrg3e",
+ "contribs": [
+ {
+ "extra": {
+ "seq": "first"
+ },
+ "index": 0,
+ "raw_name": "Giacomo Rodano",
+ "role": "author"
+ },
+ {
+ "index": 1,
+ "raw_name": "Nicolas Andre Benigno Serrano-Velarde",
+ "role": "author"
+ },
+ {
+ "index": 2,
+ "raw_name": "Emanuele Tarantino",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.2139/ssrn.2645730"
+ },
+ "extra": {
+ "crossref": {
+ "type": "journal-article"
+ }
+ },
+ "ident": "nb4yakyqebalbatnnfijkfhmka",
+ "language": "en",
+ "publisher": "Elsevier BV",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 2015,
+ "revision": "6352e747-a5aa-487f-9a6e-b7a50dc5e2b4",
+ "state": "active",
+ "title": "Bankruptcy Law and Bank Financing",
+ "work_id": "ctu7mu3bongg7pbrf5n3cernde"
+}
diff --git a/tests/data/release/pr7e4l5eibaavm3zsk62nmphni b/tests/data/release/pr7e4l5eibaavm3zsk62nmphni
new file mode 100644
index 0000000..c27d39c
--- /dev/null
+++ b/tests/data/release/pr7e4l5eibaavm3zsk62nmphni
@@ -0,0 +1,54 @@
+{
+ "abstracts": [],
+ "container_id": "exrvddewvnb4pffd64fjacpoq4",
+ "contribs": [
+ {
+ "extra": {
+ "seq": "first"
+ },
+ "index": 0,
+ "raw_name": "Giacomo Rodano",
+ "role": "author"
+ },
+ {
+ "index": 1,
+ "raw_name": "Nicolas Serrano-Velarde",
+ "role": "author"
+ },
+ {
+ "index": 2,
+ "raw_name": "Emanuele Tarantino",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.1016/j.jfineco.2016.01.016"
+ },
+ "extra": {
+ "crossref": {
+ "alternative-id": [
+ "S0304405X16000210"
+ ],
+ "subject": [
+ "Strategy and Management",
+ "Economics and Econometrics",
+ "Accounting",
+ "Finance"
+ ],
+ "type": "journal-article"
+ }
+ },
+ "ident": "pr7e4l5eibaavm3zsk62nmphni",
+ "language": "en",
+ "pages": "363-382",
+ "publisher": "Elsevier BV",
+ "refs": [],
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 2016,
+ "revision": "2de55471-3503-4399-b790-2b2dc385c75f",
+ "state": "active",
+ "title": "Bankruptcy law and bank financing",
+ "volume": "120",
+ "work_id": "7uclti36yngurb374f4mcty2jy"
+}
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index 72baebb..0f95905 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -8,7 +8,7 @@ mn26hwbmqvh23jhsecoder3ixq,544v67u75fazfp5qssqzmh6fta,Status.DIFFERENT,Miss.YEAR
4srjsirjhvhvtenz23lg6bqnqu,3czbwace7bh4hkfehzntnddt2i,Status.EXACT,OK.WORK_ID
vokr6qxyqrc55kyn45dyavr2lq,b5helm53ljdxjpxdnn5zjqpjve,Status.EXACT,OK.WORK_ID
kgeynply6vcxdeiluu6es6w72m,cm536ige6bfdfhhesp26ibfdva,Status.EXACT,OK.DATACITE_VERSION
-knwc764q25f33ib6qnwo7pyaui,n74tqiqi5jcx5d6vl5f7lpokaa,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
+knwc764q25f33ib6qnwo7pyaui,n74tqiqi5jcx5d6vl5f7lpokaa,Status.DIFFERENT,Miss.YEAR
eo4qptzoqrholjslj7nemlne2y,zisq3tsezjcejinlpf7qgk6z2i,Status.DIFFERENT,Miss.YEAR
crsd5c2fhvd7hodbd4trne3lgi,4547ybo5hvf4xhlh5triaccxai,Status.DIFFERENT,Miss.YEAR
egxon2iqljf47c4stvacnccvwy,swuxb5owx5g4hff3c7ur5x3awy,Status.DIFFERENT,Miss.YEAR
@@ -39,7 +39,7 @@ fmeud4dykjfudb5kjr2fgmaneq,iid2bnrjjbegtpgmpuppjou4k4,Status.DIFFERENT,Miss.SUBT
zmivcpjvhba25ldkx27d24oefa,mjapiqe2nzcy3fs3hriw253dye,Status.STRONG,OK.FIGSHARE_VERSION
lynlkp7wh5hn3mlpzcfz4faoqi,yrbvjd4xrjaq3jxt7pkheysclm,Status.DIFFERENT,Miss.YEAR
t3vpox5wrvbgtcigp6a6o64oey,q5yaj5zbzjctzapb5bztzctsoe,Status.DIFFERENT,Miss.YEAR
-65qtai5dmjb2hmkwa73nwafyhu,p4lk4tbohjat3g5nn5pb3kjdyu,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
+65qtai5dmjb2hmkwa73nwafyhu,p4lk4tbohjat3g5nn5pb3kjdyu,Status.DIFFERENT,Miss.YEAR
fqtc2tonfbh7hlcwoxgxzqi4lu,ng7utp7murge3ksuzbtljf5bsq,Status.DIFFERENT,Miss.YEAR
mbnr3nrdijerto6wfjnlsmfhga,ddikrsxnajblvchthiwcbsmiue,Status.STRONG,OK.DATACITE_RELATED_ID
nqfv37as6bcohketfrhiuac2mq,ty6megtz35c3hep57bbx2cetja,Status.DIFFERENT,Miss.YEAR
@@ -61,7 +61,7 @@ muk4xhjhubc3xn6qqddllgfsly,2gywie7yqfflnl6tljfo36keqi,Status.EXACT,OK.WORK_ID
iywyis7npngxxbco6fgjrclrzy,anhsfjxg3few5nkfsvheehiebq,Status.DIFFERENT,Miss.BOOK_CHAPTER
rk7mn5uaqjaslgcxc2nl6ijpaq,td3rnxzbxzeslj6ijoce3mtxcq,Status.EXACT,OK.WORK_ID
ohkfrjjcxfcavoqoqt52wi6eke,egufgu3yubgthex3y7fdt7uupa,Status.DIFFERENT,Miss.DATASET_DOI
-dklwsz4w3rdlfddif4pcxb6ngm,wsbinmv7lragjnaedbgws6bztm,Status.AMBIGUOUS,OK.DUMMY
+dklwsz4w3rdlfddif4pcxb6ngm,wsbinmv7lragjnaedbgws6bztm,Status.DIFFERENT,Miss.YEAR
jizydliu2vclvpdtcrajlvuq2m,3g6mdd3tvjabdaez6mwcycso3q,Status.EXACT,OK.WORK_ID
fvrscdvsznb4zlhuadd6ar7ot4,57la45yryjd73gav22bnl4lyni,Status.STRONG,OK.FIGSHARE_VERSION
6fedywjyynbxhdqv3etxjuqhba,gls2x7ca4nhzrkf437gdnj6ekq,Status.DIFFERENT,Miss.YEAR
@@ -76,7 +76,7 @@ hqwrsqnzdjbqhbrqnsbooohqse,ydx2wolhvffxnb6as6gekmocx4,Status.EXACT,OK.WORK_ID
vz7q453kr5ds3ptsldwxedbiii,2wzybzqlmjhjfh75cxjohbvzi4,Status.DIFFERENT,Miss.RELEASE_TYPE
efumvvpw6jbb7ehp2qfdatgxzy,funn7cwjbrgefji27tzpl4avuu,Status.EXACT,OK.WORK_ID
pjvosq3ulzeb5d6w7zijrbz75y,pxkm2asxjnflzkdi5qnfd5fpt4,Status.DIFFERENT,Miss.BOOK_CHAPTER
-ji3qg5sajndt7p54u7wumqsjye,hxau2e34bnhhbeucfdrncgmcby,Status.DIFFERENT,Miss.CONTRIB_INTERSECTION_EMPTY
+ji3qg5sajndt7p54u7wumqsjye,hxau2e34bnhhbeucfdrncgmcby,Status.DIFFERENT,Miss.YEAR
2gpvznjjcfbmhats6ot2vsodju,qk6arua2snaobfvdvlfvjp3yeq,Status.AMBIGUOUS,
75ky5xniobchzbhzwhmwhu5uoa,uvgwfvwnnbg7xchy63bloyrwvi,Status.STRONG,
jdtngtiz3bdqboypujoni2x3ry,byh7xr5qhjca3bw53ivdotck3e,Status.EXACT,
@@ -115,3 +115,4 @@ s5hm65waingwjmgf3plu76hzu4,t6k5mec4xjdebcs3iv3uzs3yvu,Status.AMBIGUOUS,
zlywxoy7cfexvaatziqp4ip5m4,phqelg6oc5hs5dehhgmodcnh5u,Status.EXACT,OK.DATACITE_VERSION
vqjpcuqxnbhdtelzspxjmklm7u,knuzh5bcqbg7ph7ffvqaiwevti,Status.AMBIGUOUS,Miss.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW
psykbwxylndtdaand2ymtkgzqu,xizkwvsodzajnn4u7lgeldqoum,Status.DIFFERENT,Miss.YEAR
+nb4yakyqebalbatnnfijkfhmka,pr7e4l5eibaavm3zsk62nmphni,Status.STRONG,
diff --git a/tests/test_verify.py b/tests/test_verify.py
index 533a5ba..75b1190 100644
--- a/tests/test_verify.py
+++ b/tests/test_verify.py
@@ -46,8 +46,13 @@ def test_compare():
.format(a=a, b=b, base=FATCAT_BASE_URL, status=status, reason=reason))
continue
assert status_mapping[
- expected_status] == status, "status: want {}, got {} {} for {} {}".format(
- expected_status, status, reason, a, b)
+ expected_status] == status, "status: want {expected_status}, got {status} {reason} for {base}release/{a} {base}release/{b}".format(
+ expected_status=expected_status,
+ status=status,
+ reason=reason,
+ base=FATCAT_BASE_URL,
+ a=a,
+ b=b)
if expected_reason:
assert expected_reason.lower() == reason.lower(
), "reason [{base}release/{a} {base}release/{b}]: want {reason}, got {expected_reason}".format(