""" Verification part of matching. Clustering results in a documents with keys and values, where values is a list of entites associated with a cluster. { "v": [ {...}, ... ], "k": "1 Grundlagen", } The list of documents will often contain false positives. The `verify` routine is a way to get a match quality assessment. > Notes TODO: allow to pass in a DOI blacklist, e.g. a list of DOI which are not valid any more; example: https://fatcat.wiki/release/azbcyqjnmrdofigpgk24ck4rpq, https://fatcat.wiki/release/eb2uf5ae7bedxe22jasf2l3faa Author matching: one long string; e.g. as last name; take an acronym of the first name; asian names; number of authors; what works specifically for the various md extractors Contributor lists; "one that have the index set"; affiliations may end up there; "subset" is an ordered list; pubmed, crossref important > Stats Stats from running over a full database dump. We need to run verification over 25586837 entity pairs, of which we 1346217/25586837 (or about 5%) are too ambiguous at this time. Found Status Reason -------------------------------------------------------------------------- 3450874 Status.EXACT Reason.TITLE_AUTHOR_MATCH 2619990 Status.STRONG Reason.SLUG_TITLE_AUTHOR_MATCH 2487633 Status.DIFFERENT Reason.YEAR 2434532 Status.EXACT Reason.WORK_ID 2085006 Status.DIFFERENT Reason.CONTRIB_INTERSECTION_EMPTY 1397420 Status.DIFFERENT Reason.SHARED_DOI_PREFIX 1355852 Status.DIFFERENT Reason.RELEASE_TYPE 1290162 Status.AMBIGUOUS Reason.UNKNOWN 1145511 Status.DIFFERENT Reason.BOOK_CHAPTER 1009657 Status.DIFFERENT Reason.DATASET_DOI 996503 Status.STRONG Reason.PMID_DOI_PAIR 868951 Status.EXACT Reason.DATACITE_VERSION 796216 Status.STRONG Reason.DATACITE_RELATED_ID 704154 Status.STRONG Reason.FIGSHARE_VERSION 534963 Status.STRONG Reason.VERSIONED_DOI 343310 Status.STRONG Reason.TOKENIZED_AUTHORS 334974 Status.STRONG Reason.JACCARD_AUTHORS 293835 Status.STRONG Reason.PREPRINT_PUBLISHED 269366 Status.DIFFERENT Reason.COMPONENT 263626 Status.DIFFERENT Reason.SUBTITLE 224021 Status.AMBIGUOUS Reason.SHORT_TITLE 152990 Status.DIFFERENT Reason.PAGE_COUNT 133811 Status.AMBIGUOUS Reason.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW 122600 Status.AMBIGUOUS Reason.CUSTOM_PREFIX_10_7916 79664 Status.STRONG Reason.CUSTOM_IEEE_ARXIV 46649 Status.DIFFERENT Reason.CUSTOM_PREFIX_10_14288 39797 Status.DIFFERENT Reason.JSTOR_ID 38598 Status.STRONG Reason.CUSTOM_BSI_UNDATED 18907 Status.STRONG Reason.CUSTOM_BSI_SUBDOC 15465 Status.EXACT Reason.DOI 13393 Status.DIFFERENT Reason.CUSTOM_IOP_MA_PATTERN 10378 Status.DIFFERENT Reason.CONTAINER 3081 Status.AMBIGUOUS Reason.BLACKLISTED 2504 Status.AMBIGUOUS Reason.BLACKLISTED_FRAGMENT 1273 Status.AMBIGUOUS Reason.APPENDIX 1063 Status.DIFFERENT Reason.TITLE_FILENAME 104 Status.DIFFERENT Reason.NUM_DIFF 4 Status.STRONG Reason.ARXIV_VERSION """ import collections import itertools import json import operator import re import sys from typing import Counter, Dict, Tuple, Type from fatcat_openapi_client import ReleaseEntity from glom import PathAccessError, glom from fuzzycat.common import Reason, Status from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_BLACKLIST, TITLE_FRAGMENT_BLACKLIST) from fuzzycat.entities import entity_to_dict from fuzzycat.utils import (author_similarity_score, clean_doi, contains_chemical_formula, dict_has_key, doi_prefix, has_doi_prefix, jaccard, num_project, parse_page_string, slugify_string) Verify = collections.namedtuple("Verify", "status reason") class GroupVerifier: """ Given an iterable (lines) from clustering step, run verification on every pair of the cluster, e.g. a cluster of four elements will yield nCr(4, 2) = 10 pairs. """ def __init__(self, iterable: collections.abc.Iterable, max_cluster_size: int = 10, verbose=False): self.iterable: collections.abc.Iterable = iterable self.max_cluster_size: int = max_cluster_size self.verbose: bool = verbose self.counter: Counter = collections.Counter() def run(self): # The result of clustering are documents that have a key k and a list of values # (of the cluster) v. get_key_values = operator.itemgetter("k", "v") for i, line in enumerate(self.iterable): if i % 20000 == 0 and self.verbose: print(i, file=sys.stderr) line = line.strip() if not line: continue doc = json.loads(line) k, vs = get_key_values(doc) if len(vs) < 2: self.counter[Reason.SINGULAR_CLUSTER] += 1 continue if len(vs) > self.max_cluster_size: self.counter[Reason.MAX_CLUSTER_SIZE_EXCEEDED] += 1 continue for a, b in itertools.combinations(vs, r=2): for re in (a, b): container_name = re.get("extra", {}).get("container_name", "") or "" if container_name.lower().strip() in CONTAINER_NAME_BLACKLIST: self.counter[Reason.CONTAINER_NAME_BLACKLIST] += 1 continue if re.get("publisher", "").lower().strip() in PUBLISHER_BLACKLIST: self.counter[Reason.PUBLISHER_BLACKLIST] += 1 continue result, reason = verify(a, b) self.counter[reason] += 1 print("https://fatcat.wiki/release/{}".format(a["ident"]), "https://fatcat.wiki/release/{}".format(b["ident"]), result, reason) self.counter["total"] = sum(v for _, v in self.counter.items()) def verify_release_entities(a: ReleaseEntity, b: ReleaseEntity, min_title_length=5) -> Type[Verify]: return verify(entity_to_dict(a), entity_to_dict(b), min_title_length=min_title_length) def verify(a: Dict, b: Dict, min_title_length=5) -> Tuple[str, str]: """ Compare two entities (dicts), return tuple of match status and reason. Note: This is a too long function, but tested so open to near-term refactor. On the plus, it is uniform in the sense, that a few lines take care of a specific case. The cases are relatively independent, so order should not matter, but that can be pinned down much more clearly. TODO: add wrapper release entities """ # A few items have the same DOI. try: a_doi = clean_doi(glom(a, "ext_ids.doi")) b_doi = clean_doi(glom(b, "ext_ids.doi")) if a_doi is not None and a_doi == b_doi: return Verify(Status.EXACT, Reason.DOI) except PathAccessError: pass # Some pre-verified pairs. if a.get("work_id") and a.get("work_id") == b.get("work_id"): return Verify(Status.EXACT, Reason.WORK_ID) a_title = a.get("title", "") or "" a_title_lower = a_title.lower() b_title = b.get("title", "") or "" b_title_lower = b_title.lower() if len(a_title) < min_title_length: return Verify(Status.AMBIGUOUS, Reason.SHORT_TITLE) if a_title_lower in TITLE_BLACKLIST: return Verify(Status.AMBIGUOUS, Reason.BLACKLISTED) for fragment in TITLE_FRAGMENT_BLACKLIST: if fragment in a_title_lower: return Verify(Status.AMBIGUOUS, Reason.BLACKLISTED_FRAGMENT) # https://fatcat.wiki/release/rnso2swxzvfonemgzrth3arumi, # https://fatcat.wiki/release/caxa7qbfqvg3bkgz4nwvapgnvi if "subject index" in a_title_lower and "subject index" in b_title_lower: try: if glom(a, "container_id") != glom(b, "container_id"): return Verify(Status.DIFFERENT, Reason.CONTAINER) except PathAccessError: pass # Datacite keeps track of versions. try: if a_title and a_title == b_title and glom(a, "extra.datacite.metadataVersion") != glom( b, "extra.datacite.metadataVersion"): return Verify(Status.EXACT, Reason.DATACITE_VERSION) except PathAccessError: pass # UBC repository, we assume that different items in the same pool. try: prefix = "10.14288/" a_doi = glom(a, "ext_ids.doi") b_doi = glom(b, "ext_ids.doi") if a_doi.startswith(prefix) and b_doi.startswith(prefix) and a_doi != b_doi: # UBC metadata slightly off; # https://fatcat.wiki/release/63g4ukdxajcqhdytqla6du3t3u, # https://fatcat.wiki/release/rz72bzfevzeofdeb342c6z45qu; # https://api.datacite.org/application/vnd.datacite.datacite+json/10.14288/1.0011045 return Verify(Status.DIFFERENT, Reason.CUSTOM_PREFIX_10_14288) except PathAccessError: pass # The British Standards Institution (BSI) keeps various version of # standards around, among them an "undated" variant. # Reference to subdocument. # https://api.fatcat.wiki/v0/release/tcro5wr6brhqnf5wettyiauw34 # https://api.fatcat.wiki/v0/release/s7a4o5v5gfg4tbzna6poyg7nzy try: a_doi = glom(a, "ext_ids.doi") b_doi = glom(b, "ext_ids.doi") if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"): if a_doi + "u" == b_doi or b_doi + "u" == a_doi: return Verify(Status.STRONG, Reason.CUSTOM_BSI_UNDATED) if a_title == b_title and ( (dict_has_key(a, "extra.subtitle") and not dict_has_key(b, "extra.subtitle")) or (dict_has_key(b, "extra.subtitle") and not dict_has_key(a, "extra.subtitle"))): return Verify(Status.STRONG, Reason.CUSTOM_BSI_SUBDOC) except PathAccessError: pass # IOP science. try: prefix = "10.1149" a_doi = glom(a, "ext_ids.doi") b_doi = glom(b, "ext_ids.doi") if has_doi_prefix(a_doi, prefix) and has_doi_prefix(b_doi, prefix): v = "{}/ma".format(prefix) if (a_doi.startswith(v) and not b_doi.startswith(v) or b_doi.startswith(v) and not a_doi.startswith(v)): return Verify(Status.DIFFERENT, Reason.CUSTOM_IOP_MA_PATTERN) except PathAccessError: pass # Very manual, XXX: move this into blacklist. if "Zweckverband Volkshochschule " in a_title and a_title != b_title: return Verify(Status.DIFFERENT, Reason.CUSTOM_VHS) if re.match(r"appendix ?[^ ]*$", a_title_lower): return Verify(Status.AMBIGUOUS, Reason.APPENDIX) # Figshare, versions. try: FIGSHARE_PREFIX = "10.6084/" if glom(a, "ext_ids.doi").startswith(FIGSHARE_PREFIX) and glom( b, "ext_ids.doi").startswith(FIGSHARE_PREFIX): a_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", glom(a, "ext_ids.doi")) b_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", glom(b, "ext_ids.doi")) if a_doi_v_stripped == b_doi_v_stripped: return Verify(Status.STRONG, Reason.FIGSHARE_VERSION) except PathAccessError: pass # Generic, versioned DOI. # https://fatcat.wiki/release/cwqujxztefdghhssb7ysxj7b5m # https://fatcat.wiki/release/hwnqyz7n65eabhlivvkipkytji try: a_doi = glom(a, "ext_ids.doi") b_doi = glom(b, "ext_ids.doi") versioned_doi_pattern = '10[.].*/v[0-9]{1,}$' if re.match(versioned_doi_pattern, a_doi) and re.match(versioned_doi_pattern, b_doi): return Verify(Status.STRONG, Reason.VERSIONED_DOI) except PathAccessError: pass # A paper/component pattern. 10.1021/acs.cgd.7b00396, # https://fatcat.wiki/release/c43itb7esjc3heb64xbohigqge, # https://fatcat.wiki/release/6kuxfopbcjcrdnhvfokjgbd5wm try: a_doi = glom(a, "ext_ids.doi") b_doi = glom(b, "ext_ids.doi") if a_doi.split(".")[:-1] == b_doi.split(".") or a_doi.split(".") == b_doi.split(".")[:-1]: return Verify(Status.STRONG, Reason.VERSIONED_DOI) except PathAccessError: pass # Datacite related identifiers. # extra.datacite.relations[].{relationType=IsNewerVersionOf,relatedIdentifier=10...} # beware: we have versions and "isPartOf", e.g. # https://api.fatcat.wiki/v0/release/ybxygpeypbaq5pfrztu3z2itw4 # Datacite md schema: https://doi.org/10.14454/7xq3-zf69 if dict_has_key(a, "extra.datacite") and dict_has_key(b, "extra.datacite"): whitelist = set([ "HasPart", "HasVersion", "IsNewVersionOf", "IsPartOf", "IsPreviousVersionOf", "IsVersionOf", ]) def get_datacite_related_doi(doc): spec = ("extra.datacite.relations", [{ "type": "relatedIdentifierType", "id": "relatedIdentifier" }]) try: return set([v["id"] for v in glom(doc, spec) if v["type"].lower() == "doi"]) except PathAccessError: return set() a_doi_rel = get_datacite_related_doi(a) b_doi_rel = get_datacite_related_doi(b) try: if glom(b, "ext_ids.doi") in a_doi_rel or glom(a, "ext_ids.doi") in b_doi_rel: return Verify(Status.STRONG, Reason.DATACITE_RELATED_ID) except PathAccessError: pass # Arxiv versions. try: id_a = re.match(r"(.*)v[0-9]{1,}$", glom(a, "ext_ids.arxiv")).group(1) id_b = re.match(r"(.*)v[0-9]{1,}$", glom(b, "ext_ids.arxiv")).group(1) if id_a == id_b: return Verify(Status.STRONG, Reason.ARXIV_VERSION) except (AttributeError, ValueError, PathAccessError) as exc: pass try: if glom(a, "release_type") != glom(b, "release_type"): # TODO(martin): This can go wrong with "article" and "article-journal" # TODO(martin): Some arxiv articles are marked are release_type: report # or paper-conference # (https://fatcat.wiki/release/l4fyyvsckneuxkq7d3y2zvkvbe) types = set([a.get("release_type"), b.get("release_type")]) # Added "entry" via # https://fatcat.wiki/release/xp3oxb7tqbgaxdzkzbchfkcjn4, # https://fatcat.wiki/release/73pcaauzwbalvi7aqhv6vopxl4 ignore_release_types = set([ "article", "article-journal", "report", "paper-conference", ]) if len(types & ignore_release_types) == 0: return Verify(Status.DIFFERENT, Reason.RELEASE_TYPE) if "dataset" in types and ("article" in types or "article-journal" in types): return Verify(Status.DIFFERENT, Reason.RELEASE_TYPE) if "book" in types and ("article" in types or "article-journal" in types): return Verify(Status.DIFFERENT, Reason.RELEASE_TYPE) except PathAccessError: pass # Datasets are typically different (and have less md and look similar). try: if (glom(a, "release_type") == "dataset" and glom(b, "release_type") == "dataset" and glom(a, "ext_ids.doi") != glom(b, "ext_ids.doi")): return Verify(Status.DIFFERENT, Reason.DATASET_DOI) except PathAccessError: pass # Common chapter names should be handled here. try: if (glom(a, "release_type") == "chapter" and glom(b, "release_type") == "chapter" and glom(a, "extra.container_name") != glom(b, "extra.container_name")): return Verify(Status.DIFFERENT, Reason.BOOK_CHAPTER) except PathAccessError: pass # Components tend to have similar names. try: if glom(a, "extra.crossref.type") == "component" and glom(a, "title") != glom(b, "title"): return Verify(Status.DIFFERENT, Reason.COMPONENT) except PathAccessError: pass try: if glom(a, "release_type") == "component" and glom(b, "release_type") == "component": a_doi = glom(a, "ext_ids.doi") b_doi = glom(b, "ext_ids.doi") if a_doi != b_doi: return Verify(Status.DIFFERENT, Reason.COMPONENT) except PathAccessError: pass # https://fatcat.wiki/release/knzhequchfcethcyyi3gsp5gry, some title contain newlines a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ") b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ") # https://fatcat.wiki/release/psykbwxylndtdaand2ymtkgzqu # https://fatcat.wiki/release/xizkwvsodzajnn4u7lgeldqoum if a_slug_title == b_slug_title: a_year = a.get("release_year") b_year = b.get("release_year") if a_year and b_year and abs(a_year - b_year) > 40: return Verify(Status.DIFFERENT, Reason.YEAR) if a_slug_title == b_slug_title: # via: https://fatcat.wiki/release/ij3yuoh6lrh3tkrv5o7gfk6yyi # https://fatcat.wiki/release/tur236mqljdfdnlzbbnks2sily def ieee_arxiv_pair_check(a, b): try: if (glom(a, "ext_ids.doi").split("/")[0] == "10.1109" and glom(b, "ext_ids.arxiv") != ""): return Verify(Status.STRONG, Reason.CUSTOM_IEEE_ARXIV) except PathAccessError: pass # TODO: we might want to have some light python DSL to express these # (commute) things result = ieee_arxiv_pair_check(a, b) if result: return result result = ieee_arxiv_pair_check(b, a) if result: return result if a_slug_title == b_slug_title: try: # https://dlc.library.columbia.edu/lcaaj/cul:p5hqbzkhxb, # https://dlc.library.columbia.edu/lcaaj/cul:5tb2rbp0nj a_doi = glom(a, "ext_ids.doi") b_doi = glom(b, "ext_ids.doi") if has_doi_prefix(a_doi, "10.7916") and has_doi_prefix(b_doi, "10.7916"): return Verify(Status.AMBIGUOUS, Reason.CUSTOM_PREFIX_10_7916) except PathAccessError: pass if a_slug_title == b_slug_title: try: a_subtitles = glom(a, "extra.subtitle") or [] b_subtitles = glom(b, "extra.subtitle") or [] for a_sub in a_subtitles: for b_sub in b_subtitles: if slugify_string(a_sub) != slugify_string(b_sub): return Verify(Status.DIFFERENT, Reason.SUBTITLE) except PathAccessError: pass arxiv_id_a = a.get("ext_ids", {}).get("arxiv") arxiv_id_b = b.get("ext_ids", {}).get("arxiv") a_authors = set([v.get("raw_name") for v in a.get("contribs", [])]) b_authors = set([v.get("raw_name") for v in b.get("contribs", [])]) a_slug_authors = set((slugify_string(v) for v in a_authors if v)) b_slug_authors = set((slugify_string(v) for v in b_authors if v)) a_release_year = a.get("release_year") b_release_year = b.get("release_year") if a_title_lower == b_title_lower: if a_authors and (a_authors == b_authors): # TODO: https://fatcat.wiki/release/utx5r5e6azbvljipznv7ejqzvq, # https://fatcat.wiki/release/oceozrqtcbc4tloizhddxaj2ti # preprint and published work may not be published in the same # year; compromise allow a small gap if a_release_year and b_release_year and abs(int(a_release_year) - int(b_release_year)) > 4: return Verify(Status.DIFFERENT, Reason.YEAR) return Verify(Status.EXACT, Reason.TITLE_AUTHOR_MATCH) if (len(a.get("title", "").split()) == 1 and re.match(r".*[.][a-z]{2,3}", a.get("title", "")) or len(b.get("title", "").split()) == 1 and re.match(r".*[.][a-z]{2,3}$", b.get("title", ""))): if a.get("title") != b.get("title"): return Verify(Status.DIFFERENT, Reason.TITLE_FILENAME) if a.get("title") and a.get("title") == b.get("title"): if a_release_year and b_release_year: if abs(int(a_release_year) - int(b_release_year)) > 2: return Verify(Status.DIFFERENT, Reason.YEAR) if contains_chemical_formula(a_slug_title) or contains_chemical_formula(b_slug_title) and ( a_slug_title != b_slug_title): return Verify(Status.DIFFERENT, Reason.CHEM_FORMULA) if len(a_slug_title) < 10 and a_slug_title != b_slug_title: return Verify(Status.AMBIGUOUS, Reason.SHORT_TITLE) if re.search(r'\d+', a_slug_title) and a_slug_title != b_slug_title and num_project( a_slug_title) == num_project(b_slug_title): return Verify(Status.DIFFERENT, Reason.NUM_DIFF) if a_slug_title and b_slug_title and a_slug_title == b_slug_title: if a_authors and len(a_authors & b_authors) > 0: if arxiv_id_a is not None and arxiv_id_b is None or arxiv_id_a is None and arxiv_id_b is not None: return Verify(Status.STRONG, Reason.PREPRINT_PUBLISHED) if a_slug_title and b_slug_title and a_slug_title.strip().replace( " ", "") == b_slug_title.strip().replace(" ", ""): if len(a_slug_authors & b_slug_authors) > 0: # At this point, year might differ, e.g. # https://fatcat.wiki/release/2n7pyugxenb73gope52bn6m2ru vs # https://fatcat.wiki/release/p4bettvcszgn5d3zls5ogdjk4u (found via refs). if a_release_year and b_release_year and abs(int(a_release_year) - int(b_release_year)) > 4: return Verify(Status.DIFFERENT, Reason.YEAR) return Verify(Status.STRONG, Reason.SLUG_TITLE_AUTHOR_MATCH) # if any([a_authors, b_authors]) and not (a_authors and b_authors): # Does not cover case, where both authors are empty. if a_release_year == b_release_year and a_title_lower == b_title_lower: if ((dict_has_key(a, "ext_ids.pmid") and dict_has_key(b, "ext_ids.doi")) or (dict_has_key(b, "ext_ids.pmid") and dict_has_key(a, "ext_ids.doi"))): return Verify(Status.STRONG, Reason.PMID_DOI_PAIR) # Two JSTOR items will probably be different. try: a_jstor_id = glom(a, "ext_ids.jstor") b_jstor_id = glom(b, "ext_ids.jstor") if a_jstor_id != b_jstor_id: return Verify(Status.DIFFERENT, Reason.JSTOR_ID) except PathAccessError: pass # Publication from same publisher and different DOI or year a probably # different. try: a_container_id = glom(a, "container_id") b_container_id = glom(b, "container_id") a_doi = glom(a, "ext_ids.doi") b_doi = glom(b, "ext_ids.doi") if a_container_id == b_container_id and a_doi != b_doi and not has_doi_prefix( a_doi, "10.1126") and doi_prefix(a_doi) == doi_prefix(b_doi): return Verify(Status.DIFFERENT, Reason.SHARED_DOI_PREFIX) except PathAccessError: pass if a_authors and len(a_slug_authors & b_slug_authors) == 0: # Before we bail out, run an authors similarity check. TODO: This is # not the right place, but lives here now, since these cases popped up # in this block. Score = collections.namedtuple("Score", "a b value") scores = [] # account for the possible arbitrary ordering of authors, XXX: this # explodes. a_trimmed = sorted(a_slug_authors)[:5] b_trimmed = sorted(b_slug_authors)[:5] num_authors = min(len(a_trimmed), len(b_trimmed)) for a, b in itertools.product(a_trimmed, b_trimmed): scores.append(Score(a, b, author_similarity_score(a, b))) # TODO: less arbitrary metric and threshold top_scores = [] for _, g in itertools.groupby(scores, key=lambda s: s.a): sorted_scores = sorted(g, key=lambda s: s.value, reverse=True) if len(sorted_scores) > 0: top_scores.append(sorted_scores[0].value) if len(top_scores) > 0: avg_score = sum(top_scores) / len(top_scores) if (num_authors < 3 and avg_score > 0.9) or (num_authors >= 3 and avg_score > 0.5): return Verify(Status.STRONG, Reason.TOKENIZED_AUTHORS) else: pass # Kuidong Xu, Joong Ki Choi, Eun Jin Yang, Kyu Chul Lee, Yanli Lei # J.K. Choi, K. Xu, E.J. Yang, K.C. Lee, Y. Lei # 0.2942857142857143 # print("author comp score: {}".format(avg_score)) # Fallback jaccard token comparison. # Kuidong Xu, Joong Ki Choi, Eun Jin Yang, Kyu Chul Lee, Yanli Lei # J.K. Choi, K. Xu, E.J. Yang, K.C. Lee, Y. Lei # avg_score was 0.2942857142857143, but jaccard ~0.38 a_tok = [tok for tok in re.findall(r"[\w]{3,}", " ".join(a_slug_authors)) if tok] b_tok = [tok for tok in re.findall(r"[\w]{3,}", " ".join(b_slug_authors)) if tok] if jaccard(set(a_tok), set(b_tok)) > 0.35: return Verify(Status.STRONG, Reason.JACCARD_AUTHORS) # TODO: This misses spelling differences, e.g. # https://fatcat.wiki/release/7nbcgsohrrak5cuyk6dnit6ega and # https://fatcat.wiki/release/q66xv7drk5fnph7enwwlkyuwqm return Verify(Status.DIFFERENT, Reason.CONTRIB_INTERSECTION_EMPTY) # mark choicereview articles as ambiguous, as they seem to be behind a paywall try: a_doi = glom(a, "ext_ids.doi") b_doi = glom(b, "ext_ids.doi") if has_doi_prefix(a_doi, "10.5860") or has_doi_prefix(b_doi, "10.5860"): return Verify(Status.AMBIGUOUS, Reason.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW) except PathAccessError: pass # If pages exists, but differ too much, bail out. # https://fatcat.wiki/release/tm3gaiumkvb3xc7t3i6suna6u4 # https://fatcat.wiki/release/r6dj63wh3zcrrolisn6xuacnve try: a_parsed_pages = parse_page_string(glom(a, "pages")) b_parsed_pages = parse_page_string(glom(b, "pages")) if (a_parsed_pages.count != None and b_parsed_pages.count != None and abs(a_parsed_pages.count - b_parsed_pages.count) > 5): return Verify(Status.DIFFERENT, Reason.PAGE_COUNT) except (ValueError, PathAccessError): pass # A variant of translated titles, e.g. # https://fatcat.wiki/release/search?q=%22A+nova+classifica%C3%A7%C3%A3o+dos+tumores+da+mama+%22 try: a_container_id = glom(a, "container_id") b_container_id = glom(b, "container_id") if a_authors == b_authors and a_container_id == b_container_id and a_release_year == b_release_year and a_title != b_title and ( a_title in b_title or b_title in a_title): return Verify(Status.STRONG, Reason.TITLE_ARTIFACT) except PathAccessError: pass return Verify(Status.AMBIGUOUS, Reason.UNKNOWN)