diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2020-12-12 02:21:59 +0100 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2020-12-12 02:21:59 +0100 | 
| commit | 09b737d1182837635856dfa2fcc4c384dfd58aaf (patch) | |
| tree | 8009c74a2e98d1037312fc95c2ec71a3380b4744 /fuzzycat | |
| parent | 7c22f54bb2b0a3f40b15ee18ee1063a7f3659c95 (diff) | |
| download | fuzzycat-09b737d1182837635856dfa2fcc4c384dfd58aaf.tar.gz fuzzycat-09b737d1182837635856dfa2fcc4c384dfd58aaf.zip | |
get rid of 'ok' and 'miss'
Diffstat (limited to 'fuzzycat')
| -rw-r--r-- | fuzzycat/common.py | 98 | ||||
| -rw-r--r-- | fuzzycat/verify.py | 104 | 
2 files changed, 100 insertions, 102 deletions
| diff --git a/fuzzycat/common.py b/fuzzycat/common.py index 2bb25c6..9ee1239 100644 --- a/fuzzycat/common.py +++ b/fuzzycat/common.py @@ -3,7 +3,8 @@ from enum import Enum  class Status(str, Enum):      """ -    Match status. +    Match status. The match status "TODO" is a placeholder, e.g. to trigger a +    test failure.      """      AMBIGUOUS = 'ambiguous'      DIFFERENT = 'different' @@ -13,54 +14,51 @@ class Status(str, Enum):      TODO = 'todo' -class OK(str, Enum): +class Reason(str, Enum):      """ -    Reason for assuming we have a match. +    Reason for assuming we have a match or miss. No hard rules on naming.      """ -    ARXIV_VERSION = 'ok.arxiv_version' -    CUSTOM_BSI_SUBDOC = 'ok.custom_bsi_subdoc' -    CUSTOM_BSI_UNDATED = 'ok.custom_bsi_undated' -    CUSTOM_IEEE_ARXIV = 'ok.custom_ieee_arxiv' -    DATACITE_RELATED_ID = 'ok.datacite_related_id' -    DATACITE_VERSION = 'ok.datacite_version' -    DOI = 'ok.doi' -    DUMMY = 'ok.dummy' -    FIGSHARE_VERSION = 'ok.figshare_version' -    JACCARD_AUTHORS = 'ok.jaccard_authors' -    PMID_DOI_PAIR = 'ok.pmid_doi_pair' -    PREPRINT_PUBLISHED = 'ok.preprint_published' -    SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match' -    TITLE_AUTHOR_MATCH = 'ok.title_author_match' -    TOKENIZED_AUTHORS = 'ok.tokenized_authors' -    VERSIONED_DOI = 'ok.versioned_doi' -    WORK_ID = 'ok.work_id' - - -class Miss(str, Enum): -    """ -    Reasons indicating mismatch. -    """ -    APPENDIX = 'miss.appendix' -    ARXIV_VERSION = 'miss.arxiv_version' -    BLACKLISTED = 'miss.blacklisted' -    BLACKLISTED_FRAGMENT = 'miss.blacklisted_fragment' -    BOOK_CHAPTER = 'miss.book_chapter' -    CHEM_FORMULA = 'miss.chem_formula' -    COMPONENT = 'miss.component' -    CONTAINER = 'miss.container' -    CONTRIB_INTERSECTION_EMPTY = 'miss.contrib_intersection_empty' -    CUSTOM_IOP_MA_PATTERN = 'miss.custom_iop_ma_pattern' -    CUSTOM_PREFIX_10_14288 = 'miss.custom_prefix_10_14288' -    CUSTOM_PREFIX_10_5860_CHOICE_REVIEW = 'miss.custom_prefix_10_5860_choice_review' -    CUSTOM_PREFIX_10_7916 = 'miss.custom_prefix_10_7916' -    CUSTOM_VHS = 'miss.vhs'  # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla -    DATASET_DOI = 'miss.dataset_doi' -    JSTOR_ID = 'miss.jstor_id' -    NUM_DIFF = 'miss.num_diff' -    PAGE_COUNT = 'miss.page_count' -    RELEASE_TYPE = 'miss.release_type' -    SHARED_DOI_PREFIX = 'miss.shared_doi_prefix' -    SHORT_TITLE = 'miss.short_title' -    SUBTITLE = 'miss.subtitle' -    TITLE_FILENAME = 'miss.title_filename' -    YEAR = 'miss.year' +    APPENDIX = 'appendix' +    ARXIV_VERSION = 'arxiv_version' +    BLACKLISTED = 'blacklisted' +    BLACKLISTED_FRAGMENT = 'blacklisted_fragment' +    BOOK_CHAPTER = 'book_chapter' +    CHEM_FORMULA = 'chem_formula' +    COMPONENT = 'component' +    CONTAINER = 'container' +    CONTAINER_NAME_BLACKLIST = 'container_name_blacklist' +    CONTRIB_INTERSECTION_EMPTY = 'contrib_intersection_empty' +    CUSTOM_BSI_SUBDOC = 'custom_bsi_subdoc' +    CUSTOM_BSI_UNDATED = 'custom_bsi_undated' +    CUSTOM_IEEE_ARXIV = 'custom_ieee_arxiv' +    CUSTOM_IOP_MA_PATTERN = 'custom_iop_ma_pattern' +    CUSTOM_PREFIX_10_14288 = 'custom_prefix_10_14288' +    CUSTOM_PREFIX_10_5860_CHOICE_REVIEW = 'custom_prefix_10_5860_choice_review' +    CUSTOM_PREFIX_10_7916 = 'custom_prefix_10_7916' +    CUSTOM_VHS = 'vhs'  # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla +    DATACITE_RELATED_ID = 'datacite_related_id' +    DATACITE_VERSION = 'datacite_version' +    DATASET_DOI = 'dataset_doi' +    DOI = 'doi' +    DUMMY = 'dummy' +    FIGSHARE_VERSION = 'figshare_version' +    JACCARD_AUTHORS = 'jaccard_authors' +    JSTOR_ID = 'jstor_id' +    MAX_CLUSTER_SIZE_EXCEEDED = 'max_cluster_size_exceeded' +    NUM_DIFF = 'num_diff' +    PAGE_COUNT = 'page_count' +    PMID_DOI_PAIR = 'pmid_doi_pair' +    PREPRINT_PUBLISHED = 'preprint_published' +    PUBLISHER_BLACKLIST = 'publisher_blacklist' +    RELEASE_TYPE = 'release_type' +    SHARED_DOI_PREFIX = 'shared_doi_prefix' +    SHORT_TITLE = 'short_title' +    SINGULAR_CLUSTER = 'singular_cluster' +    SLUG_TITLE_AUTHOR_MATCH = 'slug_title_author_match' +    SUBTITLE = 'subtitle' +    TITLE_AUTHOR_MATCH = 'title_author_match' +    TITLE_FILENAME = 'title_filename' +    TOKENIZED_AUTHORS = 'tokenized_authors' +    VERSIONED_DOI = 'versioned_doi' +    WORK_ID = 'work_id' +    YEAR = 'year' diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index 94e8327..973ea13 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -74,7 +74,7 @@ import sys  from glom import PathAccessError, glom -from fuzzycat.common import OK, Miss, Status +from fuzzycat.common import Status, Reason  from fuzzycat.data import (CONTAINER_NAME_BLACKLIST, PUBLISHER_BLACKLIST, TITLE_BLACKLIST,                             TITLE_FRAGMENT_BLACKLIST)  from fuzzycat.utils import (author_similarity_score, contains_chemical_formula, dict_key_exists, @@ -115,19 +115,19 @@ class GroupVerifier:              doc = json.loads(line)              k, vs = get_key_values(doc)              if len(vs) < 2: -                self.counter["skip.unique"] += 1 +                self.counter[Reason.SINGULAR_CLUSTER] += 1                  continue              if len(vs) > self.max_cluster_size: -                self.counter["skip.too_large"] += 1 +                self.counter[Reason.MAX_CLUSTER_SIZE_EXCEEDED] += 1                  continue              for a, b in itertools.combinations(vs, r=2):                  for re in (a, b):                      container_name = re.get("extra", {}).get("container_name", "") or ""                      if container_name.lower().strip() in CONTAINER_NAME_BLACKLIST: -                        self.counter["skip.container_name_blacklist"] += 1 +                        self.counter[Reason.CONTAINER_NAME_BLACKLIST] += 1                          continue                      if re.get("publisher", "").lower().strip() in PUBLISHER_BLACKLIST: -                        self.counter["skip.publisher_blacklist"] += 1 +                        self.counter[Reason.PUBLISHER_BLACKLIST] += 1                          continue                  result, reason = verify(a, b)                  self.counter[reason] += 1 @@ -146,12 +146,12 @@ def verify(a, b):      """      try:          if glom(a, "ext_ids.doi") == glom(b, "ext_ids.doi"): -            return (Status.EXACT, OK.DOI) +            return (Status.EXACT, Reason.DOI)      except PathAccessError:          pass      if a.get("work_id") and a.get("work_id") == b.get("work_id"): -        return (Status.EXACT, OK.WORK_ID) +        return (Status.EXACT, Reason.WORK_ID)      a_title = a.get("title", "")      a_title_lower = a_title.lower() @@ -159,27 +159,27 @@ def verify(a, b):      b_title_lower = b_title.lower()      if len(a_title) < 5: -        return (Status.AMBIGUOUS, Miss.SHORT_TITLE) +        return (Status.AMBIGUOUS, Reason.SHORT_TITLE)      if a_title_lower in TITLE_BLACKLIST: -        return (Status.AMBIGUOUS, Miss.BLACKLISTED) +        return (Status.AMBIGUOUS, Reason.BLACKLISTED)      for fragment in TITLE_FRAGMENT_BLACKLIST:          if fragment in a_title_lower: -            return (Status.AMBIGUOUS, Miss.BLACKLISTED_FRAGMENT) +            return (Status.AMBIGUOUS, Reason.BLACKLISTED_FRAGMENT)      # https://fatcat.wiki/release/rnso2swxzvfonemgzrth3arumi,      # https://fatcat.wiki/release/caxa7qbfqvg3bkgz4nwvapgnvi      if "subject index" in a_title_lower and "subject index" in b_title_lower:          try:              if glom(a, "container_id") != glom(b, "container_id"): -                return (Status.DIFFERENT, Miss.CONTAINER) +                return (Status.DIFFERENT, Reason.CONTAINER)          except PathAccessError:              pass      try:          if a_title and a_title == b_title and glom(a, "extra.datacite.metadataVersion") != glom(                  b, "extra.datacite.metadataVersion"): -            return (Status.EXACT, OK.DATACITE_VERSION) +            return (Status.EXACT, Reason.DATACITE_VERSION)      except PathAccessError:          pass @@ -191,7 +191,7 @@ def verify(a, b):              # https://fatcat.wiki/release/63g4ukdxajcqhdytqla6du3t3u,              # https://fatcat.wiki/release/rz72bzfevzeofdeb342c6z45qu;              # https://api.datacite.org/application/vnd.datacite.datacite+json/10.14288/1.0011045 -            return (Status.DIFFERENT, Miss.CUSTOM_PREFIX_10_14288) +            return (Status.DIFFERENT, Reason.CUSTOM_PREFIX_10_14288)      except PathAccessError:          pass @@ -200,7 +200,7 @@ def verify(a, b):          b_doi = glom(b, "ext_ids.doi")          if has_doi_prefix(a_doi, "10.3403") and has_doi_prefix(b_doi, "10.3403"):              if a_doi + "u" == b_doi or b_doi + "u" == a_doi: -                return (Status.STRONG, OK.CUSTOM_BSI_UNDATED) +                return (Status.STRONG, Reason.CUSTOM_BSI_UNDATED)              # Reference to subdocument.              # https://api.fatcat.wiki/v0/release/tcro5wr6brhqnf5wettyiauw34              # https://api.fatcat.wiki/v0/release/s7a4o5v5gfg4tbzna6poyg7nzy @@ -208,7 +208,7 @@ def verify(a, b):                                          and not dict_key_exists(b, "extra.subtitle")) or                                         (dict_key_exists(b, "extra.subtitle")                                          and not dict_key_exists(a, "extra.subtitle"))): -                return (Status.STRONG, OK.CUSTOM_BSI_SUBDOC) +                return (Status.STRONG, Reason.CUSTOM_BSI_SUBDOC)      except PathAccessError:          pass @@ -218,15 +218,15 @@ def verify(a, b):          if has_doi_prefix(a_doi, "10.1149") and has_doi_prefix(b_doi, "10.1149"):              if (a_doi.startswith("10.1149/ma") and not b_doi.startswith("10.1149/ma")                      or b_doi.startswith("10.1149/ma") and not a_doi.startswith("10.1149/ma")): -                return (Status.DIFFERENT, Miss.CUSTOM_IOP_MA_PATTERN) +                return (Status.DIFFERENT, Reason.CUSTOM_IOP_MA_PATTERN)      except PathAccessError:          pass      if "Zweckverband Volkshochschule " in a_title and a_title != b_title: -        return (Status.DIFFERENT, Miss.CUSTOM_VHS) +        return (Status.DIFFERENT, Reason.CUSTOM_VHS)      if re.match(r"appendix ?[^ ]*$", a_title_lower): -        return (Status.AMBIGUOUS, Miss.APPENDIX) +        return (Status.AMBIGUOUS, Reason.APPENDIX)      try:          # TODO: figshare versions, "xxx.v1" @@ -236,7 +236,7 @@ def verify(a, b):              a_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", glom(a, "ext_ids.doi"))              b_doi_v_stripped = re.sub(r"[.]v[0-9]+$", "", glom(b, "ext_ids.doi"))              if a_doi_v_stripped == b_doi_v_stripped: -                return (Status.STRONG, OK.FIGSHARE_VERSION) +                return (Status.STRONG, Reason.FIGSHARE_VERSION)      except PathAccessError:          pass @@ -247,7 +247,7 @@ def verify(a, b):          b_doi = glom(b, "ext_ids.doi")          versioned_doi_pattern = '10[.].*/v[0-9]{1,}$'          if re.match(versioned_doi_pattern, a_doi) and re.match(versioned_doi_pattern, b_doi): -            return (Status.STRONG, OK.VERSIONED_DOI) +            return (Status.STRONG, Reason.VERSIONED_DOI)      except PathAccessError:          pass @@ -258,7 +258,7 @@ def verify(a, b):          a_doi = glom(a, "ext_ids.doi")          b_doi = glom(b, "ext_ids.doi")          if a_doi.split(".")[:-1] == b_doi.split(".") or a_doi.split(".") == b_doi.split(".")[:-1]: -            return (Status.STRONG, OK.VERSIONED_DOI) +            return (Status.STRONG, Reason.VERSIONED_DOI)      except PathAccessError:          pass @@ -291,7 +291,7 @@ def verify(a, b):          b_doi_rel = get_datacite_related_doi(b)          try:              if glom(b, "ext_ids.doi") in a_doi_rel or glom(a, "ext_ids.doi") in b_doi_rel: -                return (Status.STRONG, OK.DATACITE_RELATED_ID) +                return (Status.STRONG, Reason.DATACITE_RELATED_ID)          except PathAccessError:              pass @@ -299,7 +299,7 @@ def verify(a, b):          id_a = re.match(r"(.*)v[0-9]{1,}$", glom(a, "ext_ids.arxiv")).group(1)          id_b = re.match(r"(.*)v[0-9]{1,}$", glom(b, "ext_ids.arxiv")).group(1)          if id_a == id_b: -            return (Status.STRONG, OK.ARXIV_VERSION) +            return (Status.STRONG, Reason.ARXIV_VERSION)      except (AttributeError, ValueError, PathAccessError) as exc:          pass @@ -320,31 +320,31 @@ def verify(a, b):                  "paper-conference",              ])              if len(types & ignore_release_types) == 0: -                return (Status.DIFFERENT, Miss.RELEASE_TYPE) +                return (Status.DIFFERENT, Reason.RELEASE_TYPE)              if "dataset" in types and ("article" in types or "article-journal" in types): -                return (Status.DIFFERENT, Miss.RELEASE_TYPE) +                return (Status.DIFFERENT, Reason.RELEASE_TYPE)              if "book" in types and ("article" in types or "article-journal" in types): -                return (Status.DIFFERENT, Miss.RELEASE_TYPE) +                return (Status.DIFFERENT, Reason.RELEASE_TYPE)      except PathAccessError:          pass      try:          if (glom(a, "release_type") == "dataset" and glom(b, "release_type") == "dataset"                  and glom(a, "ext_ids.doi") != glom(b, "ext_ids.doi")): -            return (Status.DIFFERENT, Miss.DATASET_DOI) +            return (Status.DIFFERENT, Reason.DATASET_DOI)      except PathAccessError:          pass      try:          if (glom(a, "release_type") == "chapter" and glom(b, "release_type") == "chapter"                  and glom(a, "extra.container_name") != glom(b, "extra.container_name")): -            return (Status.DIFFERENT, Miss.BOOK_CHAPTER) +            return (Status.DIFFERENT, Reason.BOOK_CHAPTER)      except PathAccessError:          pass      try:          if glom(a, "extra.crossref.type") == "component" and glom(a, "title") != glom(b, "title"): -            return (Status.DIFFERENT, Miss.COMPONENT) +            return (Status.DIFFERENT, Reason.COMPONENT)      except PathAccessError:          pass @@ -353,7 +353,7 @@ def verify(a, b):              a_doi = glom(a, "ext_ids.doi")              b_doi = glom(b, "ext_ids.doi")              if a_doi != b_doi: -                return (Status.DIFFERENT, Miss.COMPONENT) +                return (Status.DIFFERENT, Reason.COMPONENT)      except PathAccessError:          pass @@ -367,7 +367,7 @@ def verify(a, b):          a_year = a.get("release_year")          b_year = b.get("release_year")          if a_year and b_year and abs(a_year - b_year) > 40: -            return (Status.DIFFERENT, Miss.YEAR) +            return (Status.DIFFERENT, Reason.YEAR)      if a_slug_title == b_slug_title:          # via: https://fatcat.wiki/release/ij3yuoh6lrh3tkrv5o7gfk6yyi @@ -376,7 +376,7 @@ def verify(a, b):              try:                  if (glom(a, "ext_ids.doi").split("/")[0] == "10.1109"                          and glom(b, "ext_ids.arxiv") != ""): -                    return (Status.STRONG, OK.CUSTOM_IEEE_ARXIV) +                    return (Status.STRONG, Reason.CUSTOM_IEEE_ARXIV)              except PathAccessError:                  pass @@ -396,7 +396,7 @@ def verify(a, b):              a_doi = glom(a, "ext_ids.doi")              b_doi = glom(b, "ext_ids.doi")              if has_doi_prefix(a_doi, "10.7916") and has_doi_prefix(b_doi, "10.7916"): -                return (Status.AMBIGUOUS, Miss.CUSTOM_PREFIX_10_7916) +                return (Status.AMBIGUOUS, Reason.CUSTOM_PREFIX_10_7916)          except PathAccessError:              pass @@ -407,7 +407,7 @@ def verify(a, b):              for a_sub in a_subtitles:                  for b_sub in b_subtitles:                      if slugify_string(a_sub) != slugify_string(b_sub): -                        return (Status.DIFFERENT, Miss.SUBTITLE) +                        return (Status.DIFFERENT, Reason.SUBTITLE)          except PathAccessError:              pass @@ -429,54 +429,54 @@ def verify(a, b):              # year; compromise allow a small gap              if a_release_year and b_release_year and abs(int(a_release_year) -                                                           int(b_release_year)) > 1: -                return (Status.DIFFERENT, Miss.YEAR) -            return (Status.EXACT, OK.TITLE_AUTHOR_MATCH) +                return (Status.DIFFERENT, Reason.YEAR) +            return (Status.EXACT, Reason.TITLE_AUTHOR_MATCH)      if (len(a.get("title", "").split()) == 1 and re.match(r".*[.][a-z]{2,3}", a.get("title", ""))              or len(b.get("title", "").split()) == 1              and re.match(r".*[.][a-z]{2,3}$", b.get("title", ""))):          if a.get("title") != b.get("title"): -            return (Status.DIFFERENT, Miss.TITLE_FILENAME) +            return (Status.DIFFERENT, Reason.TITLE_FILENAME)      if a.get("title") and a.get("title") == b.get("title"):          if a_release_year and b_release_year:              if abs(int(a_release_year) - int(b_release_year)) > 2: -                return (Status.DIFFERENT, Miss.YEAR) +                return (Status.DIFFERENT, Reason.YEAR)      if contains_chemical_formula(a_slug_title) or contains_chemical_formula(b_slug_title) and (              a_slug_title != b_slug_title): -        return (Status.DIFFERENT, Miss.CHEM_FORMULA) +        return (Status.DIFFERENT, Reason.CHEM_FORMULA)      if len(a_slug_title) < 10 and a_slug_title != b_slug_title: -        return (Status.AMBIGUOUS, Miss.SHORT_TITLE) +        return (Status.AMBIGUOUS, Reason.SHORT_TITLE)      if re.search(r'\d+', a_slug_title) and a_slug_title != b_slug_title and num_project(              a_slug_title) == num_project(b_slug_title): -        return (Status.DIFFERENT, Miss.NUM_DIFF) +        return (Status.DIFFERENT, Reason.NUM_DIFF)      if a_slug_title and b_slug_title and a_slug_title == b_slug_title:          if a_authors and len(a_authors & b_authors) > 0:              if arxiv_id_a is not None and arxiv_id_b is None or arxiv_id_a is None and arxiv_id_b is not None: -                return (Status.STRONG, OK.PREPRINT_PUBLISHED) +                return (Status.STRONG, Reason.PREPRINT_PUBLISHED)      if a_slug_title and b_slug_title and a_slug_title.strip().replace(              " ", "") == b_slug_title.strip().replace(" ", ""):          if len(a_slug_authors & b_slug_authors) > 0: -            return (Status.STRONG, OK.SLUG_TITLE_AUTHOR_MATCH) +            return (Status.STRONG, Reason.SLUG_TITLE_AUTHOR_MATCH)      # if any([a_authors, b_authors]) and not (a_authors and b_authors):      # Does not cover case, where both authors are empty.      if a_release_year == b_release_year and a_title_lower == b_title_lower:          if ((dict_key_exists(a, "ext_ids.pmid") and dict_key_exists(b, "ext_ids.doi"))                  or (dict_key_exists(b, "ext_ids.pmid") and dict_key_exists(a, "ext_ids.doi"))): -            return (Status.STRONG, OK.PMID_DOI_PAIR) +            return (Status.STRONG, Reason.PMID_DOI_PAIR)      # Two JSTOR items will probably be different.      try:          a_jstor_id = glom(a, "ext_ids.jstor")          b_jstor_id = glom(b, "ext_ids.jstor")          if a_jstor_id != b_jstor_id: -            return (Status.DIFFERENT, Miss.JSTOR_ID) +            return (Status.DIFFERENT, Reason.JSTOR_ID)      except PathAccessError:          pass @@ -490,7 +490,7 @@ def verify(a, b):          if a_container_id == b_container_id and a_doi != b_doi and not has_doi_prefix(                  a_doi, "10.1126"): -            return (Status.DIFFERENT, Miss.SHARED_DOI_PREFIX) +            return (Status.DIFFERENT, Reason.SHARED_DOI_PREFIX)      except PathAccessError:          pass @@ -515,7 +515,7 @@ def verify(a, b):          if len(top_scores) > 0:              avg_score = sum(top_scores) / len(top_scores)              if avg_score > 0.5: -                return (Status.STRONG, OK.TOKENIZED_AUTHORS) +                return (Status.STRONG, Reason.TOKENIZED_AUTHORS)              else:                  pass                  # Kuidong Xu, Joong Ki Choi, Eun Jin Yang, Kyu Chul Lee, Yanli Lei @@ -530,19 +530,19 @@ def verify(a, b):          a_tok = [tok for tok in re.findall(r"[\w]{3,}", " ".join(a_slug_authors)) if tok]          b_tok = [tok for tok in re.findall(r"[\w]{3,}", " ".join(b_slug_authors)) if tok]          if jaccard(set(a_tok), set(b_tok)) > 0.35: -            return (Status.STRONG, OK.JACCARD_AUTHORS) +            return (Status.STRONG, Reason.JACCARD_AUTHORS)          # TODO: This misses spelling differences, e.g.          # https://fatcat.wiki/release/7nbcgsohrrak5cuyk6dnit6ega and          # https://fatcat.wiki/release/q66xv7drk5fnph7enwwlkyuwqm -        return (Status.DIFFERENT, Miss.CONTRIB_INTERSECTION_EMPTY) +        return (Status.DIFFERENT, Reason.CONTRIB_INTERSECTION_EMPTY)      # mark choicereview articles as ambiguous, as they seem to be behind a paywall      try:          a_doi = glom(a, "ext_ids.doi")          b_doi = glom(b, "ext_ids.doi")          if has_doi_prefix(a_doi, "10.5860") or has_doi_prefix(b_doi, "10.5860"): -            return (Status.AMBIGUOUS, Miss.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW) +            return (Status.AMBIGUOUS, Reason.CUSTOM_PREFIX_10_5860_CHOICE_REVIEW)      except PathAccessError:          pass @@ -562,8 +562,8 @@ def verify(a, b):              b_num_pages = int(b_end) - int(b_start)              if a_num_pages >= 0 and b_num_pages >= 0:                  if abs(a_num_pages - b_num_pages) > 5: -                    return (Status.DIFFERENT, Miss.PAGE_COUNT) +                    return (Status.DIFFERENT, Reason.PAGE_COUNT)      except PathAccessError:          pass -    return (Status.AMBIGUOUS, OK.DUMMY) +    return (Status.AMBIGUOUS, Reason.DUMMY) | 
