diff options
| -rw-r--r-- | fuzzycat/verify.py | 24 | 
1 files changed, 22 insertions, 2 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py index d277000..9af70e4 100644 --- a/fuzzycat/verify.py +++ b/fuzzycat/verify.py @@ -39,6 +39,8 @@ get_key_values = operator.itemgetter("k", "v")  # There titles appear too often, so ignore them for now.  TITLE_BLACKLIST = set([ +    "annual meeting", +    "an invitation to membership",      "",      ":{unav)",      "[others]", @@ -63,6 +65,7 @@ TITLE_BLACKLIST = set([      "acknowledgments",      "actualités",      "agradecimento", +    "all pdfs of this category",      "announcement",      "announcements",      "annual report", @@ -73,6 +76,7 @@ TITLE_BLACKLIST = set([      "author response image 1. author response",      "back matter",      "backmatter", +    "bericht",      "bibliography",      "book review",      "book reviews", @@ -95,6 +99,7 @@ TITLE_BLACKLIST = set([      "discussion",      "editorial board",      "editorial", +    "educators personally",      "einleitung",      "erratum",      "foreword", @@ -103,15 +108,20 @@ TITLE_BLACKLIST = set([      "frontmatter",      "fundraising",      "gbif occurrence download", +    "general medical council",      "in this issue", +    "index des auteurs",      "index",      "inhalt", +    "inhalt-impressum", +    "inhalt.impressum",      "interlude",      "introduction",      "issue information",      "letter to the editor",      "letters to the editor",      "list of delegates", +    "map projections",      "masthead",      "methotrexate",      "miscellany", @@ -128,9 +138,11 @@ TITLE_BLACKLIST = set([      "preface",      "preliminary material",      "preservation image", +    "production",      "references",      "regulations",      "reply", +    "research items",      "reviews of books",      "reviews",      "short notices", @@ -138,6 +150,7 @@ TITLE_BLACKLIST = set([      "subject index",      "table of contents",      "taxonomic abstract for the species.", +    "thank you",      "the applause data release 2",      "奥付",      "投稿規定", @@ -189,6 +202,7 @@ class Miss(str, Enum):      CUSTOM_VHS = 'miss.vhs'  # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla      NUM_DIFF = 'miss.num_diff'      DATASET_DOI = 'miss.dataset_doi' +    RELEASE_TYPE = 'miss.release_type'  class GroupVerifier:      """ @@ -249,8 +263,11 @@ def compare(a, b):      if "Zweckverband Volkshochschule " in a.get("title") and a.get("title") != b.get("title"):          return (Status.DIFFERENT, Miss.CUSTOM_VHS) -    if (a.get("extra", {}).get("crossref", {}).get("type",  {}) == "dataset" and -        b.get("extra", {}).get("crossref", {}).get("type",  {}) == "dataset"): +    if a.get("release_type") and b.get("release_type") and a.get("release_type") != b.get("release_type"): +        return (Status.DIFFERENT, Miss.RELEASE_TYPE) + +    if (a.get("release_type") == "dataset" and +        b.get("release_type") == "dataset"):          if (a.get("ext_ids", {}).get("doi") and b.get("ext_ids", {}).get("doi") and              a.get("ext_ids", {}).get("doi") != b.get("ext_ids", {}).get("doi")):              return (Status.DIFFERENT, Miss.DATASET_DOI) @@ -280,6 +297,9 @@ def compare(a, b):      a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ")      b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ") +    if len(a_slug_title) < 10 and a_slug_title != b_slug_title: +        return (Status.AMBIGUOUS, Miss.SHORT_TITLE) +      if re.search(r'\d', a_slug_title) and a_slug_title != b_slug_title and num_project(              a_slug_title) == num_project(b_slug_title):          return (Status.DIFFERENT, Miss.NUM_DIFF)  | 
