From 09b737d1182837635856dfa2fcc4c384dfd58aaf Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Sat, 12 Dec 2020 02:21:59 +0100
Subject: get rid of 'ok' and 'miss'

---
 fuzzycat/verify.py | 104 ++++++++++++++++++++++++++---------------------------
 1 file changed, 52 insertions(+), 52 deletions(-)

(limited to 'fuzzycat/verify.py')

diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 94e8327..973ea13 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -74,7 +74,7 @@ import sys

 from glom import PathAccessError, glom

-from fuzzycat.common import OK, Miss, Status
+from fuzzycat.common import Status, Reason
 from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_BLACKLIST,
                            TITLE_FRAGMENT_BLACKLIST)
 from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, dict_key_exists,
@@ -115,19 +115,19 @@ class GroupVerifier:
             doc = json.loads(line)
             k, vs = get_key_values(doc)
             if len(vs) < 2:
-                self.counter["skip.unique"] += 1
+                self.counter[Reason.SINGULAR_CLUSTER] += 1
                 continue
             if len(vs) > self.max_cluster_size:
-                self.counter["skip.too_large"] += 1
+                self.counter[Reason.MAX_CLUSTER_SIZE_EXCEEDED] += 1
                 continue
             for a, b in itertools.combinations(vs, r=2):
                 for re in (a, b):
                     container_name = re.get("extra", {}).get("container_name", "") or ""
                     if container_name.lower().strip() in CONTAINER_NAME_BLACKLIST:
-                        self.counter["skip.container_name_blacklist"] += 1
+                        self.counter[Reason.CONTAINER_NAME_BLACKLIST] += 1
                         continue
                     if re.get("publisher", "").lower().strip() in PUBLISHER_BLACKLIST:
-                        self.counter["skip.publisher_blacklist"] += 1
+                        self.counter[Reason.PUBLISHER_BLACKLIST] += 1
                         continue
                 result, reason = verify(a, b)
                 self.counter[reason] += 1
@@ -146,12 +146,12 @@ def verify(a, b):
     """
     try:
         if glom(a, "ext_ids.doi") == glom(b, "ext_ids.doi"):
-            return (Status.EXACT, OK.DOI)
+            return (Status.EXACT, Reason.DOI)
     except PathAccessError:
         pass

     if a.get("work_id") and a.get("work_id") == b.get("work_id"):
-        return (Status.EXACT, OK.WORK_ID)
+        return (Status.EXACT, Reason.WORK_ID)

     a_title = a.get("title", "")
     a_title_lower = a_title.lower()
@@ -159,27 +159,27 @@ def verify(a, b):
     b_title_lower = b_title.lower()

     if len(a_title) < 5:
-        return (Status.AMBIGUOUS, Miss.SHORT_TITLE)
+        return (Status.AMBIGUOUS, Reason.SHORT_TITLE)

     if a_title_lower in TITLE_BLACKLIST:
-        return (Status.AMBIGUOUS, Miss.BLACKLISTED)
+        return (Status.AMBIGUOUS, Reason.BLACKLISTED)

     for fragment in TITLE_FRAGMENT_BLACKLIST:
         if fragment in a_title_lower:
-            return (Status.AMBIGUOUS, Miss.BLACKLISTED_FRAGMENT)
+            return (Status.AMBIGUOUS, Reason.BLACKLISTED_FRAGMENT)

     # https://fatcat.wiki/release/rnso2swxzvfonemgzrth3arumi,
     # https://fatcat.wiki/release/caxa7qbfqvg3bkgz4nwvapgnvi
     if "subject index" in a_title_lower and "subject index" in b_title_lower:
         try:
             if glom(a, "container_id") != glom(b, "container_id"):
-                return (Status.DIFFERENT, Miss.CONTAINER)
+                return (Status.DIFFERENT, Reason.CONTAINER)
         except PathAccessError:
             pass

     try:
         if a_title and a_title == b_title and glom(a, "extra.datacite.metadataVersion") != glom(
                 b, "extra.datacite.metadataVersion"):
-            return (Status.EXACT, OK.DATACITE_VERSION)
+            return (Status.EXACT, Reason.DATACITE_VERSION)
     except PathAccessError:
         pass
@@ -191,7 +191,7 @@ def verify(a, b):
             # https://fatcat.wiki/release/63g4ukdxajcqhdytqla6du3t3u,
             # https://fatcat.wiki/release/rz72bzfevzeofdeb342c6z45qu;
             # https://api.datacite.org/application/vnd.datacite.datacite+json/10.14288/1.0011045
-            return (Status.DIFFERENT, Miss.CUSTOM_PREFIX_10_14288)
+            return (Status.DIFFERENT, Reason.CUSTOM_PREFIX_10_14288)
     except PathAccessError:
         pass

@@ -200,7 +200,7 @@ def verify(a, b):
         b_doi = glom(b, "ext_ids.doi")
         if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"):
             if a_doi + "u" == b_doi or b_doi + "u" == a_doi:
-                return (Status.STRONG, OK.CUSTOM_BSI_UNDATED)
+                return (Status.STRONG, Reason.CUSTOM_BSI_UNDATED)
             # Reference to subdocument.
             # https://api.fatcat.wiki/v0/release/tcro5wr6brhqnf5wettyiauw34
             # https://api.fatcat.wiki/v0/release/s7a4o5v5gfg4tbzna6poyg7nzy
@@ -208,7 +208,7 @@ def verify(a, b):
                  and not dict_key_exists(b, "extra.subtitle"))
                     or (dict_key_exists(b, "extra.subtitle")
                         and not dict_key_exists(a, "extra.subtitle"))):
-                return (Status.STRONG, OK.CUSTOM_BSI_SUBDOC)
+                return (Status.STRONG, Reason.CUSTOM_BSI_SUBDOC)
     except PathAccessError:
         pass

@@ -218,15 +218,15 @@ def verify(a, b):
         if has_doi_prefix(a_doi, "10.1149") and has_doi_prefix(b_doi, "10.1149"):
             if (a_doi.startswith("10.1149/ma") and not b_doi.startswith("10.1149/ma")
                     or b_doi.startswith("10.1149/ma") and not a_doi.startswith("10.1149/ma")):
-                return (Status.DIFFERENT, Miss.CUSTOM_IOP_MA_PATTERN)
+                return (Status.DIFFERENT, Reason.CUSTOM_IOP_MA_PATTERN)
     except PathAccessError:
         pass

     if "Zweckverband Volkshochschule " in a_title and a_title != b_title:
-        return (Status.DIFFERENT, Miss.CUSTOM_VHS)
+        return (Status.DIFFERENT, Reason.CUSTOM_VHS)

     if re.match(r"appendix ?[^ ]*$", a_title_lower):
-        return (Status.AMBIGUOUS, Miss.APPENDIX)
+        return (Status.AMBIGUOUS, Reason.APPENDIX)

     try:
         # TODO: figshare versions, "xxx.v1"
@@ -236,7 +236,7 @@ def verify(a, b):
         a_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", glom(a, "ext_ids.doi"))
         b_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", glom(b, "ext_ids.doi"))
         if a_doi_v_stripped == b_doi_v_stripped:
-            return (Status.STRONG, OK.FIGSHARE_VERSION)
+            return (Status.STRONG, Reason.FIGSHARE_VERSION)
     except PathAccessError:
         pass

@@ -247,7 +247,7 @@ def verify(a, b):
         b_doi = glom(b, "ext_ids.doi")
         versioned_doi_pattern = '10[.].*/v[0-9]{1,}$'
         if re.match(versioned_doi_pattern, a_doi) and re.match(versioned_doi_pattern, b_doi):
-            return (Status.STRONG, OK.VERSIONED_DOI)
+            return (Status.STRONG, Reason.VERSIONED_DOI)
     except PathAccessError:
         pass

@@ -258,7 +258,7 @@ def verify(a, b):
         a_doi = glom(a, "ext_ids.doi")
         b_doi = glom(b, "ext_ids.doi")
         if a_doi.split(".")[:-1] == b_doi.split(".") or a_doi.split(".") == b_doi.split(".")[:-1]:
-            return (Status.STRONG, OK.VERSIONED_DOI)
+            return (Status.STRONG, Reason.VERSIONED_DOI)
     except PathAccessError:
         pass

@@ -291,7 +291,7 @@ def verify(a, b):
     b_doi_rel = get_datacite_related_doi(b)
     try:
         if glom(b, "ext_ids.doi") in a_doi_rel or glom(a, "ext_ids.doi") in b_doi_rel:
-            return (Status.STRONG, OK.DATACITE_RELATED_ID)
+            return (Status.STRONG, Reason.DATACITE_RELATED_ID)
     except PathAccessError:
         pass

@@ -299,7 +299,7 @@ def verify(a, b):
         id_a = re.match(r"(.*)v[0-9]{1,}$", glom(a, "ext_ids.arxiv")).group(1)
         id_b = re.match(r"(.*)v[0-9]{1,}$", glom(b, "ext_ids.arxiv")).group(1)
         if id_a == id_b:
-            return (Status.STRONG, OK.ARXIV_VERSION)
+            return (Status.STRONG, Reason.ARXIV_VERSION)
     except (AttributeError, ValueError, PathAccessError) as exc:
         pass

@@ -320,31 +320,31 @@ def verify(a, b):
             "paper-conference",
         ])
         if len(types & ignore_release_types) == 0:
-            return (Status.DIFFERENT, Miss.RELEASE_TYPE)
+            return (Status.DIFFERENT, Reason.RELEASE_TYPE)
         if "dataset" in types and ("article" in types or "article-journal" in types):
-            return (Status.DIFFERENT, Miss.RELEASE_TYPE)
+            return (Status.DIFFERENT, Reason.RELEASE_TYPE)
         if "book" in types and ("article" in types or "article-journal" in types):
-            return (Status.DIFFERENT, Miss.RELEASE_TYPE)
+            return (Status.DIFFERENT, Reason.RELEASE_TYPE)
     except PathAccessError:
         pass

     try:
         if (glom(a, "release_type") == "dataset" and glom(b, "release_type") == "dataset"
                 and glom(a, "ext_ids.doi") != glom(b, "ext_ids.doi")):
-            return (Status.DIFFERENT, Miss.DATASET_DOI)
+            return (Status.DIFFERENT, Reason.DATASET_DOI)
     except PathAccessError:
         pass

     try:
         if (glom(a, "release_type") == "chapter" and glom(b, "release_type") == "chapter"
                 and glom(a, "extra.container_name") != glom(b, "extra.container_name")):
-            return (Status.DIFFERENT, Miss.BOOK_CHAPTER)
+            return (Status.DIFFERENT, Reason.BOOK_CHAPTER)
     except PathAccessError:
         pass

     try:
         if glom(a, "extra.crossref.type") == "component" and glom(a, "title") != glom(b, "title"):
-            return (Status.DIFFERENT, Miss.COMPONENT)
+            return (Status.DIFFERENT, Reason.COMPONENT)
     except PathAccessError:
         pass

@@ -353,7 +353,7 @@ def verify(a, b):
         a_doi = glom(a, "ext_ids.doi")
         b_doi = glom(b, "ext_ids.doi")
         if a_doi != b_doi:
-            return (Status.DIFFERENT, Miss.COMPONENT)
+            return (Status.DIFFERENT, Reason.COMPONENT)
     except PathAccessError:
         pass

@@ -367,7 +367,7 @@ def verify(a, b):
     a_year = a.get("release_year")
     b_year = b.get("release_year")
     if a_year and b_year and abs(a_year - b_year) > 40:
-        return (Status.DIFFERENT, Miss.YEAR)
+        return (Status.DIFFERENT, Reason.YEAR)

     if a_slug_title == b_slug_title:
         # via: https://fatcat.wiki/release/ij3yuoh6lrh3tkrv5o7gfk6yyi
@@ -376,7 +376,7 @@ def verify(a, b):
         try:
             if (glom(a, "ext_ids.doi").split("/")[0] == "10.1109"
                     and glom(b, "ext_ids.arxiv") != ""):
-                return (Status.STRONG, OK.CUSTOM_IEEE_ARXIV)
+                return (Status.STRONG, Reason.CUSTOM_IEEE_ARXIV)
         except PathAccessError:
             pass

@@ -396,7 +396,7 @@ def verify(a, b):
         a_doi = glom(a, "ext_ids.doi")
         b_doi = glom(b, "ext_ids.doi")
         if has_doi_prefix(a_doi, "10.7916") and has_doi_prefix(b_doi, "10.7916"):
-            return (Status.AMBIGUOUS, Miss.CUSTOM_PREFIX_10_7916)
+            return (Status.AMBIGUOUS, Reason.CUSTOM_PREFIX_10_7916)
     except PathAccessError:
         pass

@@ -407,7 +407,7 @@ def verify(a, b):
         for a_sub in a_subtitles:
             for b_sub in b_subtitles:
                 if slugify_string(a_sub) != slugify_string(b_sub):
-                    return (Status.DIFFERENT, Miss.SUBTITLE)
+                    return (Status.DIFFERENT, Reason.SUBTITLE)
     except PathAccessError:
         pass

@@ -429,54 +429,54 @@ def verify(a, b):
             # year; compromise allow a small gap
             if a_release_year and b_release_year and abs(int(a_release_year) -
                                                          int(b_release_year)) > 1:
-                return (Status.DIFFERENT, Miss.YEAR)
-            return (Status.EXACT, OK.TITLE_AUTHOR_MATCH)
+                return (Status.DIFFERENT, Reason.YEAR)
+            return (Status.EXACT, Reason.TITLE_AUTHOR_MATCH)

     if (len(a.get("title", "").split()) == 1 and re.match(r".*[.][a-z]{2,3}", a.get("title", ""))
             or len(b.get("title", "").split()) == 1
             and re.match(r".*[.][a-z]{2,3}$", b.get("title", ""))):
         if a.get("title") != b.get("title"):
-            return (Status.DIFFERENT, Miss.TITLE_FILENAME)
+            return (Status.DIFFERENT, Reason.TITLE_FILENAME)

     if a.get("title") and a.get("title") == b.get("title"):
         if a_release_year and b_release_year:
             if abs(int(a_release_year) - int(b_release_year)) > 2:
-                return (Status.DIFFERENT, Miss.YEAR)
+                return (Status.DIFFERENT, Reason.YEAR)

     if contains_chemical_formula(a_slug_title) or contains_chemical_formula(b_slug_title) and (
             a_slug_title != b_slug_title):
-        return (Status.DIFFERENT, Miss.CHEM_FORMULA)
+        return (Status.DIFFERENT, Reason.CHEM_FORMULA)

     if len(a_slug_title) < 10 and a_slug_title != b_slug_title:
-        return (Status.AMBIGUOUS, Miss.SHORT_TITLE)
+        return (Status.AMBIGUOUS, Reason.SHORT_TITLE)

     if re.search(r'\d+', a_slug_title) and a_slug_title != b_slug_title and num_project(
             a_slug_title) == num_project(b_slug_title):
-        return (Status.DIFFERENT, Miss.NUM_DIFF)
+        return (Status.DIFFERENT, Reason.NUM_DIFF)

     if a_slug_title and b_slug_title and a_slug_title == b_slug_title:
         if a_authors and len(a_authors & b_authors) > 0:
             if arxiv_id_a is not None and arxiv_id_b is None or arxiv_id_a is None and arxiv_id_b is not None:
-                return (Status.STRONG, OK.PREPRINT_PUBLISHED)
+                return (Status.STRONG, Reason.PREPRINT_PUBLISHED)

     if a_slug_title and b_slug_title and a_slug_title.strip().replace(
             " ", "") == b_slug_title.strip().replace(" ", ""):
         if len(a_slug_authors & b_slug_authors) > 0:
-            return (Status.STRONG, OK.SLUG_TITLE_AUTHOR_MATCH)
+            return (Status.STRONG, Reason.SLUG_TITLE_AUTHOR_MATCH)

     # if any([a_authors, b_authors]) and not (a_authors and b_authors):
     # Does not cover case, where both authors are empty.
     if a_release_year == b_release_year and a_title_lower == b_title_lower:
         if ((dict_key_exists(a, "ext_ids.pmid") and dict_key_exists(b, "ext_ids.doi"))
                 or (dict_key_exists(b, "ext_ids.pmid") and dict_key_exists(a, "ext_ids.doi"))):
-            return (Status.STRONG, OK.PMID_DOI_PAIR)
+            return (Status.STRONG, Reason.PMID_DOI_PAIR)

     # Two JSTOR items will probably be different.
     try:
         a_jstor_id = glom(a, "ext_ids.jstor")
         b_jstor_id = glom(b, "ext_ids.jstor")
         if a_jstor_id != b_jstor_id:
-            return (Status.DIFFERENT, Miss.JSTOR_ID)
+            return (Status.DIFFERENT, Reason.JSTOR_ID)
     except PathAccessError:
         pass

@@ -490,7 +490,7 @@ def verify(a, b):

         if a_container_id == b_container_id and a_doi != b_doi and not has_doi_prefix(
                 a_doi, "10.1126"):
-            return (Status.DIFFERENT, Miss.SHARED_DOI_PREFIX)
+            return (Status.DIFFERENT, Reason.SHARED_DOI_PREFIX)
     except PathAccessError:
         pass

@@ -515,7 +515,7 @@ def verify(a, b):
             if len(top_scores) > 0:
                 avg_score = sum(top_scores) / len(top_scores)
                 if avg_score > 0.5:
-                    return (Status.STRONG, OK.TOKENIZED_AUTHORS)
+                    return (Status.STRONG, Reason.TOKENIZED_AUTHORS)
         else:
             pass
             # Kuidong Xu, Joong Ki Choi, Eun Jin Yang, Kyu Chul Lee, Yanli Lei
@@ -530,19 +530,19 @@ def verify(a, b):
         a_tok = [tok for tok in re.findall(r"[\w]{3,}", " ".join(a_slug_authors)) if tok]
         b_tok = [tok for tok in re.findall(r"[\w]{3,}", " ".join(b_slug_authors)) if tok]
         if jaccard(set(a_tok), set(b_tok)) > 0.35:
-            return (Status.STRONG, OK.JACCARD_AUTHORS)
+            return (Status.STRONG, Reason.JACCARD_AUTHORS)

         # TODO: This misses spelling differences, e.g.
         # https://fatcat.wiki/release/7nbcgsohrrak5cuyk6dnit6ega and
         # https://fatcat.wiki/release/q66xv7drk5fnph7enwwlkyuwqm
-        return (Status.DIFFERENT, Miss.CONTRIB_INTERSECTION_EMPTY)
+        return (Status.DIFFERENT, Reason.CONTRIB_INTERSECTION_EMPTY)

     # mark choicereview articles as ambiguous, as they seem to be behind a paywall
     try:
         a_doi = glom(a, "ext_ids.doi")
         b_doi = glom(b, "ext_ids.doi")
         if has_doi_prefix(a_doi, "10.5860") or has_doi_prefix(b_doi, "10.5860"):
-            return (Status.AMBIGUOUS, Miss.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW)
+            return (Status.AMBIGUOUS, Reason.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW)
     except PathAccessError:
         pass

@@ -562,8 +562,8 @@ def verify(a, b):
         b_num_pages = int(b_end) - int(b_start)
         if a_num_pages >= 0 and b_num_pages >= 0:
             if abs(a_num_pages - b_num_pages) > 5:
-                return (Status.DIFFERENT, Miss.PAGE_COUNT)
+                return (Status.DIFFERENT, Reason.PAGE_COUNT)
     except PathAccessError:
         pass

-    return (Status.AMBIGUOUS, OK.DUMMY)
+    return (Status.AMBIGUOUS, Reason.DUMMY)
--
cgit v1.2.3