diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2020-12-03 21:58:11 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2020-12-03 21:58:11 +0100 |
commit | f32c435c207f439abb66de5dbb6a1b67a75d0405 (patch) | |
tree | 8604255831074571437f7c346cf2f17c39458992 | |
parent | 560091e02fd90a98cb79c808d8bb2000c60b9e67 (diff) | |
download | fuzzycat-f32c435c207f439abb66de5dbb6a1b67a75d0405.tar.gz fuzzycat-f32c435c207f439abb66de5dbb6a1b67a75d0405.zip |
add case
-rw-r--r-- | fuzzycat/common.py | 1 | ||||
-rw-r--r-- | fuzzycat/verify.py | 12 | ||||
-rw-r--r-- | notes/todo.md | 1 | ||||
-rw-r--r-- | tests/data/release/phqelg6oc5hs5dehhgmodcnh5u | 24 | ||||
-rw-r--r-- | tests/data/release/zlywxoy7cfexvaatziqp4ip5m4 | 24 | ||||
-rw-r--r-- | tests/data/verify.csv | 1 | ||||
-rw-r--r-- | tests/test_verify.py | 4 |
7 files changed, 65 insertions, 2 deletions
diff --git a/fuzzycat/common.py b/fuzzycat/common.py index 60f42ab..07f62df 100644 --- a/fuzzycat/common.py +++ b/fuzzycat/common.py @@ -45,6 +45,7 @@ class Miss(str, Enum): CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty' CUSTOM_IOP_MA_PATTERN = 'miss.custom_iop_ma_pattern' CUSTOM_PREFIX_10_14288 = 'miss.custom_prefix_10_14288' + CUSTOM_PREFIX_10_7916 = 'miss.custom_prefix_10_7916' CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla DATASET_DOI = 'miss.dataset_doi' NUM_DIFF = 'miss.num_diff' diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 993b7c9..6d824d4 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -212,6 +212,7 @@ def compare(a, b): if re.match(r"appendix ?[^ ]*$", a_title_lower): return (Status.AMBIGUOUS, Miss.APPENDIX) + try: # TODO: figshare versions, "xxx.v1" FIGSHARE_PREFIX = "10.6084/" @@ -332,6 +333,17 @@ def compare(a, b): if a_slug_title == b_slug_title: try: + # https://dlc.library.columbia.edu/lcaaj/cul:p5hqbzkhxb, + # https://dlc.library.columbia.edu/lcaaj/cul:5tb2rbp0nj + a_doi = glom(a, "ext_ids.doi") + b_doi = glom(b, "ext_ids.doi") + if has_doi_prefix(a_doi, "10.7916") and has_doi_prefix(b_doi, "10.7916"): + return (Status.AMBIGUOUS, Miss.CUSTOM_PREFIX_10_7916) + except PathAccessError: + pass + + if a_slug_title == b_slug_title: + try: a_subtitles = glom(a, "extra.subtitle") or [] b_subtitles = glom(b, "extra.subtitle") or [] for a_sub in a_subtitles: diff --git a/notes/todo.md b/notes/todo.md index 97313ce..b3474f8 100644 --- a/notes/todo.md +++ b/notes/todo.md @@ -91,3 +91,4 @@ Blacklist fragment. * [ ] https://fatcat.wiki/release/zlywxoy7cfexvaatziqp4ip5m4 https://fatcat.wiki/release/phqelg6oc5hs5dehhgmodcnh5u Status.AMBIGUOUS OK.DUMMY +> one item contains more md, but the physical entity seems to be the same; 0058904_001 vs 0058904 diff --git a/tests/data/release/phqelg6oc5hs5dehhgmodcnh5u b/tests/data/release/phqelg6oc5hs5dehhgmodcnh5u new file mode 100644 index 0000000..2386a7f --- /dev/null +++ b/tests/data/release/phqelg6oc5hs5dehhgmodcnh5u @@ -0,0 +1,24 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.7916/d8rr2t04" + }, + "extra": { + "datacite": { + "metadataVersion": 3 + }, + "release_month": 8 + }, + "ident": "phqelg6oc5hs5dehhgmodcnh5u", + "publisher": "Columbia University", + "refs": [], + "release_date": "2017-08-17", + "release_stage": "published", + "release_type": "article", + "release_year": 2017, + "revision": "7fa57bde-8630-47a2-9f33-f87257e77679", + "state": "active", + "title": "Eastern questionnaire, answer sheet for Interviewee 52223, page 194", + "work_id": "sqm5fkxxnjhf7fo2ln4lzgevja" +} diff --git a/tests/data/release/zlywxoy7cfexvaatziqp4ip5m4 b/tests/data/release/zlywxoy7cfexvaatziqp4ip5m4 new file mode 100644 index 0000000..4ed1fe3 --- /dev/null +++ b/tests/data/release/zlywxoy7cfexvaatziqp4ip5m4 @@ -0,0 +1,24 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.7916/d8n0229j" + }, + "extra": { + "datacite": { + "metadataVersion": 2 + }, + "release_month": 8 + }, + "ident": "zlywxoy7cfexvaatziqp4ip5m4", + "publisher": "Columbia University", + "refs": [], + "release_date": "2017-08-17", + "release_stage": "published", + "release_type": "article", + "release_year": 2017, + "revision": "766922dd-17c4-4d2c-bdf5-ece79413d300", + "state": "active", + "title": "Eastern questionnaire, answer sheet for Interviewee 52223, page 194", + "work_id": "rnhzlq2y5fbe5dnynfh7qnnlf4" +} diff --git a/tests/data/verify.csv b/tests/data/verify.csv index 058a471..16ee457 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -112,3 +112,4 @@ zkqujozrx5cnjitmglclt6heqq,urr2gs4dsbbwdl7asgyqnwwtxy,Status.AMBIGUOUS,Miss.BLAC iwtrxnov2repzlgoi2at2md6tm,s5hm65waingwjmgf3plu76hzu4,Status.AMBIGUOUS, iwtrxnov2repzlgoi2at2md6tm,t6k5mec4xjdebcs3iv3uzs3yvu,Status.AMBIGUOUS, s5hm65waingwjmgf3plu76hzu4,t6k5mec4xjdebcs3iv3uzs3yvu,Status.AMBIGUOUS, +zlywxoy7cfexvaatziqp4ip5m4,phqelg6oc5hs5dehhgmodcnh5u,Status.EXACT,OK.DATACITE_VERSION diff --git a/tests/test_verify.py b/tests/test_verify.py index a80dcda..533a5ba 100644 --- a/tests/test_verify.py +++ b/tests/test_verify.py @@ -46,8 +46,8 @@ def test_compare(): .format(a=a, b=b, base=FATCAT_BASE_URL, status=status, reason=reason)) continue assert status_mapping[ - expected_status] == status, "status: want {}, got {} for {} {}".format( - expected_status, status, a, b) + expected_status] == status, "status: want {}, got {} {} for {} {}".format( + expected_status, status, reason, a, b) if expected_reason: assert expected_reason.lower() == reason.lower( ), "reason [{base}release/{a} {base}release/{b}]: want {reason}, got {expected_reason}".format( |