about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-17 02:32:42 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-17 02:32:42 +0100
commit 8d708f337c8abbec61229f023637ddebcee827b5 (patch)
tree a975875f6a48f1c0b90d6656383e89bf64e22f4a
parent fffe4b0dbae944bd7d2a3a3fefc30d6dab7daf4c (diff)
download fuzzycat-8d708f337c8abbec61229f023637ddebcee827b5.tar.gz
fuzzycat-8d708f337c8abbec61229f023637ddebcee827b5.zip
update blacklists
-rw-r--r-- fuzzycat/verify.py 24
1 files changed, 22 insertions, 2 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index d277000..9af70e4 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -39,6 +39,8 @@ get_key_values = operator.itemgetter("k", "v")
# These titles appear too often, so ignore them for now.
TITLE_BLACKLIST = set([
+ "annual meeting",
+ "an invitation to membership",
"",
":{unav)",
"[others]",
@@ -63,6 +65,7 @@ TITLE_BLACKLIST = set([
"acknowledgments",
"actualités",
"agradecimento",
+ "all pdfs of this category",
"announcement",
"announcements",
"annual report",
@@ -73,6 +76,7 @@ TITLE_BLACKLIST = set([
"author response image 1. author response",
"back matter",
"backmatter",
+ "bericht",
"bibliography",
"book review",
"book reviews",
@@ -95,6 +99,7 @@ TITLE_BLACKLIST = set([
"discussion",
"editorial board",
"editorial",
+ "educators personally",
"einleitung",
"erratum",
"foreword",
@@ -103,15 +108,20 @@ TITLE_BLACKLIST = set([
"frontmatter",
"fundraising",
"gbif occurrence download",
+ "general medical council",
"in this issue",
+ "index des auteurs",
"index",
"inhalt",
+ "inhalt-impressum",
+ "inhalt.impressum",
"interlude",
"introduction",
"issue information",
"letter to the editor",
"letters to the editor",
"list of delegates",
+ "map projections",
"masthead",
"methotrexate",
"miscellany",
@@ -128,9 +138,11 @@ TITLE_BLACKLIST = set([
"preface",
"preliminary material",
"preservation image",
+ "production",
"references",
"regulations",
"reply",
+ "research items",
"reviews of books",
"reviews",
"short notices",
@@ -138,6 +150,7 @@ TITLE_BLACKLIST = set([
"subject index",
"table of contents",
"taxonomic abstract for the species.",
+ "thank you",
"the applause data release 2",
"奥付",
"投稿規定",
@@ -189,6 +202,7 @@ class Miss(str, Enum):
CUSTOM_VHS = 'miss.vhs' # https://fatcat.wiki/release/44gk5ben5vghljq6twm7lwmxla
NUM_DIFF = 'miss.num_diff'
DATASET_DOI = 'miss.dataset_doi'
+ RELEASE_TYPE = 'miss.release_type'
class GroupVerifier:
"""
@@ -249,8 +263,11 @@ def compare(a, b):
if "Zweckverband Volkshochschule " in a.get("title") and a.get("title") != b.get("title"):
return (Status.DIFFERENT, Miss.CUSTOM_VHS)
- if (a.get("extra", {}).get("crossref", {}).get("type", {}) == "dataset" and
- b.get("extra", {}).get("crossref", {}).get("type", {}) == "dataset"):
+ if a.get("release_type") and b.get("release_type") and a.get("release_type") != b.get("release_type"):
+ return (Status.DIFFERENT, Miss.RELEASE_TYPE)
+
+ if (a.get("release_type") == "dataset" and
+ b.get("release_type") == "dataset"):
if (a.get("ext_ids", {}).get("doi") and b.get("ext_ids", {}).get("doi") and
a.get("ext_ids", {}).get("doi") != b.get("ext_ids", {}).get("doi")):
return (Status.DIFFERENT, Miss.DATASET_DOI)
@@ -280,6 +297,9 @@ def compare(a, b):
a_slug_title = slugify_string(a.get("title", "")).replace("\n", " ")
b_slug_title = slugify_string(b.get("title", "")).replace("\n", " ")
+ if len(a_slug_title) < 10 and a_slug_title != b_slug_title:
+ return (Status.AMBIGUOUS, Miss.SHORT_TITLE)
+
if re.search(r'\d', a_slug_title) and a_slug_title != b_slug_title and num_project(
a_slug_title) == num_project(b_slug_title):
return (Status.DIFFERENT, Miss.NUM_DIFF)