From e5bcf8ba46b6851b677078358b7ffd26072c2523 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 11 Dec 2020 02:17:06 +0100 Subject: add generic doi version case --- fuzzycat/utils.py | 14 ++++++ fuzzycat/verify.py | 33 +++++++------ tests/data/release/6kuxfopbcjcrdnhvfokjgbd5wm | 67 +++++++++++++++++++++++++++ tests/data/release/c43itb7esjc3heb64xbohigqge | 20 ++++++++ tests/data/verify.csv | 3 +- tests/test_verify.py | 6 +-- 6 files changed, 122 insertions(+), 21 deletions(-) create mode 100644 tests/data/release/6kuxfopbcjcrdnhvfokjgbd5wm create mode 100644 tests/data/release/c43itb7esjc3heb64xbohigqge diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index ef3b418..2dc2adb 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -3,12 +3,26 @@ import itertools import re import string +from glom import glom, PathAccessError + printable_no_punct = string.digits + string.ascii_letters + string.whitespace # More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/ CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+") +def dict_key_exists(doc, path): + """ + Return true, if a value at a given path exists. XXX: probably in glom, too. + """ + try: + _ = glom(doc, path) + except PathAccessError: + return False + else: + return True + + def has_doi_prefix(v, prefix="10.1234"): """ Returns False, if we cannot parse v or prefix does not match. diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 9f5aa4f..94e8327 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -77,8 +77,8 @@ from glom import PathAccessError, glom from fuzzycat.common import OK, Miss, Status from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_BLACKLIST, TITLE_FRAGMENT_BLACKLIST) -from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, has_doi_prefix, - jaccard, num_project, slugify_string) +from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, dict_key_exists, + has_doi_prefix, jaccard, num_project, slugify_string) # The result of clustering are documents that have a key k and a list of values # (of the cluster) v. @@ -129,7 +129,7 @@ class GroupVerifier: if re.get("publisher", "").lower().strip() in PUBLISHER_BLACKLIST: self.counter["skip.publisher_blacklist"] += 1 continue - result, reason = compare(a, b) + result, reason = verify(a, b) self.counter[reason] += 1 print("https://fatcat.wiki/release/{}".format(a["ident"]), "https://fatcat.wiki/release/{}".format(b["ident"]), result, reason) @@ -137,21 +137,9 @@ class GroupVerifier: self.counter["total"] = sum(v for _, v in self.counter.items()) -def dict_key_exists(doc, path): +def verify(a, b): """ - Return true, if a value at a given path exists. XXX: probably in glom, too. - """ - try: - _ = glom(doc, path) - except PathAccessError: - return False - else: - return True - - -def compare(a, b): - """ - Compare two entities, return match status and reason. + Compare two entities (dicts), return tuple of match status and reason. TODO: We might want a bunch of kwargs for things like year gap threshold and the like. @@ -263,6 +251,17 @@ def compare(a, b): except PathAccessError: pass + # A paper/component pattern. 10.1021/acs.cgd.7b00396, + # https://fatcat.wiki/release/c43itb7esjc3heb64xbohigqge, + # https://fatcat.wiki/release/6kuxfopbcjcrdnhvfokjgbd5wm + try: + a_doi = glom(a, "ext_ids.doi") + b_doi = glom(b, "ext_ids.doi") + if a_doi.split(".")[:-1] == b_doi.split(".") or a_doi.split(".") == b_doi.split(".")[:-1]: + return (Status.STRONG, OK.VERSIONED_DOI) + except PathAccessError: + pass + # TODO: datacite specific vocabulary # extra.datacite.relations[].{relationType=IsNewerVersionOf,relatedIdentifier=10...} # beware: we have versions and "isPartOf", e.g. https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4 diff --git a/tests/data/release/6kuxfopbcjcrdnhvfokjgbd5wm b/tests/data/release/6kuxfopbcjcrdnhvfokjgbd5wm new file mode 100644 index 0000000..9e028a8 --- /dev/null +++ b/tests/data/release/6kuxfopbcjcrdnhvfokjgbd5wm @@ -0,0 +1,67 @@ +{ + "abstracts": [], + "container_id": "tfncqskjxjgbvilbdcq654of3m", + "contribs": [ + { + "creator_id": "xf7grzc3gjfg5kbwbgfvppmn5m", + "extra": { + "seq": "first" + }, + "index": 0, + "raw_affiliation": "Electronic Materials Research\nDivision, Osaka Municipal Technical Research Institute, Joto-ku, Osaka 536-8553, Japan", + "raw_name": "Tsutomu Shinagawa", + "role": "author" + }, + { + "index": 1, + "raw_affiliation": "Electronic Materials Research\nDivision, Osaka Municipal Technical Research Institute, Joto-ku, Osaka 536-8553, Japan", + "raw_name": "Mitsuru Watanabe", + "role": "author" + }, + { + "index": 2, + "raw_affiliation": "Electronic Materials Research\nDivision, Osaka Municipal Technical Research Institute, Joto-ku, Osaka 536-8553, Japan", + "raw_name": "Jun-ichi Tani", + "role": "author" + }, + { + "index": 3, + "raw_affiliation": "Electronic Materials Research\nDivision, Osaka Municipal Technical Research Institute, Joto-ku, Osaka 536-8553, Japan", + "raw_name": "Masaya Chigane", + "role": "author" + } + ], + "ext_ids": { + "doi": "10.1021/acs.cgd.7b00396" + }, + "extra": { + "crossref": { + "alternative-id": [ + "10.1021/acs.cgd.7b00396" + ], + "funder": [ + { + "DOI": "10.13039/501100001700", + "award": [], + "doi-asserted-by": "publisher", + "name": "Ministry of Education, Culture, Sports, Science and Technology" + } + ], + "type": "journal-article" + } + }, + "ident": "6kuxfopbcjcrdnhvfokjgbd5wm", + "language": "en", + "pages": "3826-3833", + "publisher": "American Chemical Society (ACS)", + "refs": [], + "release_date": "2017-06-08", + "release_stage": "published", + "release_type": "article-journal", + "release_year": 2017, + "revision": "feea6f2d-0996-4e3a-bd8d-1c543a228699", + "state": "active", + "title": "(0001)-Oriented Single-Crystal-Like Porous ZnO on ITO Substrates via Quasi-Topotactic Transformation from (001)-Oriented Zinc Hydroxychloride Crystals", + "volume": "17", + "work_id": "qc6y573kejhurfijmya7gmymeq" +} diff --git a/tests/data/release/c43itb7esjc3heb64xbohigqge b/tests/data/release/c43itb7esjc3heb64xbohigqge new file mode 100644 index 0000000..7f975ab --- /dev/null +++ b/tests/data/release/c43itb7esjc3heb64xbohigqge @@ -0,0 +1,20 @@ +{ + "abstracts": [], + "contribs": [], + "ext_ids": { + "doi": "10.1021/acs.cgd.7b00396.s001" + }, + "extra": { + "crossref": { + "type": "component" + } + }, + "ident": "c43itb7esjc3heb64xbohigqge", + "publisher": "American Chemical Society (ACS)", + "refs": [], + "release_type": "component", + "revision": "728aa1ed-533d-4517-9738-384b76ae69b8", + "state": "active", + "title": "(0001)-Oriented Single-Crystal-Like Porous ZnO on ITO Substrates via Quasi-Topotactic Transformation from (001)-Oriented Zinc Hydroxychloride Crystals", + "work_id": "cum2sjlwkbazzbhf43iq3vozuu" +} diff --git a/tests/data/verify.csv b/tests/data/verify.csv index 0439292..d90dc96 100644 --- a/tests/data/verify.csv +++ b/tests/data/verify.csv @@ -148,7 +148,7 @@ pobnow7sxfhnxhltgwpru5k7oi,uplqxenmk5axjes6zokml6q73y,Status.DIFFERENT,Miss.RELE tm3gaiumkvb3xc7t3i6suna6u4,pobnow7sxfhnxhltgwpru5k7oi,Status.DIFFERENT,Miss.RELEASE_TYPE lqswbciv2vfkzit5zamjaqik6m,zularouecbg5fg4nd6yswxf3s4,Status.DIFFERENT,Miss.JSTOR_ID j6ipokw3lfflhl2de7afxhac2a,rbgpleyhanakxing2f3234d7xq,Status.AMBIGUOUS, -bruczmzvnzhtdkd2tf3meg3oou,a7wuehxrv5edpb5265qx27yvmy,Status.AMBIGUOUS, +bruczmzvnzhtdkd2tf3meg3oou,a7wuehxrv5edpb5265qx27yvmy,Status.STRONG,OK.VERSIONED_DOI tebqkxnjpzfxnpsqmt5klv2ppm,uqyjav3arngq7bqmzsllxrkpmu,Status.DIFFERENT, e3fs7ttdbrds3bvsbm7lynzlpu,vpswmj3cgfhktggwvmz33fkwuq,Status.DIFFERENT, gtsbvudmjzdeppqgzjpmfedycq,27lrseg7jfhxbdxohph7il7a7m,Status.DIFFERENT,Miss.JSTOR_ID @@ -159,3 +159,4 @@ qnblx3fetbegpe7ryt444dpkke,kokj44xkcfhxvorj7cs7rov2ku,Status.DIFFERENT,Miss.RELE vrwrf372jbd2vbwcb6fllsvhae,s43ecmng5bbqzcqhxmo7wbfsma,Status.DIFFERENT,Miss.RELEASE_TYPE 4z2amr4cizd2jexlr7uu4jxrsa,nvyd2rotrraelcuchnu6cjbxty,Status.STRONG,OK.PMID_DOI_PAIR qqsdtxm5hjadta3jf7bgt3bnm4,fupvtkn7t5d5xohffx5bt4yn24,Status.AMBIGUOUS, +6kuxfopbcjcrdnhvfokjgbd5wm,c43itb7esjc3heb64xbohigqge,Status.STRONG, diff --git a/tests/test_verify.py b/tests/test_verify.py index 07808af..79c3143 100644 --- a/tests/test_verify.py +++ b/tests/test_verify.py @@ -5,7 +5,7 @@ import os import pytest -from fuzzycat.verify import Status, compare +from fuzzycat.verify import Status, verify VERIFY_CSV = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data/verify.csv") RELEASE_ENTITIES_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data/release") @@ -32,7 +32,7 @@ def load_release_ident(ident): return json.load(f) -def test_compare(): +def test_verify(): with open(VERIFY_CSV) as f: reader = csv.reader(f, delimiter=',') for i, row in enumerate(reader): @@ -42,7 +42,7 @@ def test_compare(): pytest.fail( "invalid test file, maybe too many (or few) commas in row {}? {}".format( i + 1, exc)) - status, reason = compare(load_release_ident(a), load_release_ident(b)) + status, reason = verify(load_release_ident(a), load_release_ident(b)) if not expected_status or expected_status.lower() == "todo": logger.warning( "skipping test {base}release/{a} {base}release/{b} -- no result defined (we think {status}, {reason})" -- cgit v1.2.3