about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2020-11-16 19:55:39 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2020-11-16 19:55:39 +0100
commit f5b403704e1385639904cd1ca6d4e4af8cdbb359 (patch)
tree 1f6ef948598488ee9f3635b98dc4ccc757645a6c
parent 6855a687a37c0a009eb6fbb2c82ded49304452de (diff)
download fuzzycat-f5b403704e1385639904cd1ca6d4e4af8cdbb359.tar.gz
fuzzycat-f5b403704e1385639904cd1ca6d4e4af8cdbb359.zip
update blacklist
-rw-r--r-- fuzzycat/verify.py 47
1 file changed, 39 insertions, 8 deletions
diff --git a/fuzzycat/verify.py b/fuzzycat/verify.py
index 81731c4..b3743aa 100644
--- a/fuzzycat/verify.py
+++ b/fuzzycat/verify.py
@@ -41,14 +41,20 @@ get_key_values = operator.itemgetter("k", "v")
TITLE_BLACKLIST = set([
"",
":{unav)",
- "Positions Available",
+ "Abbildungsnachweis",
"[others]",
"[s.n.]",
"a correction",
"abbildung",
"abbreviations and acronyms",
+ "about the cover",
+ "about the editor",
+ "about the editors",
"about this issue",
"about this journal",
+ "about this title",
+ "abréviations",
+ "abstracts of papers to appear in future issues",
"abstracts",
"acknowledgement of reviewers",
"acknowledgements to reviewers",
@@ -59,15 +65,21 @@ TITLE_BLACKLIST = set([
"agradecimento",
"announcement",
"announcements",
+ "around the world",
"arthrobacter sp.",
+ "aufgaben",
"author index",
+ "author response image 1. author response",
"back matter",
"backmatter",
"bibliography",
"book review",
"book reviews",
"books received",
+ "bookseller's catalogue",
"calendar",
+ "canto",
+ "canto",
"conclusion",
"conclusions",
"contents",
@@ -87,22 +99,30 @@ TITLE_BLACKLIST = set([
"front cover",
"front matter",
"frontmatter",
+ "fundraising",
"gbif occurrence download",
"in this issue",
"index",
"inhalt",
+ "interlude",
"introduction",
"issue information",
"letter to the editor",
"letters to the editor",
+ "list of delegates",
"masthead",
+ "methotrexate",
"miscellany",
+ "news section",
"news",
"not available",
+ "note of appreciation / note de reconnaissance",
"notes",
"occurrence download",
"oup accepted manuscript",
+ "parliamentary intelligence",
"petitions.xlsx",
+ "positions available",
"preface",
"preliminary material",
"preservation image",
@@ -151,6 +171,7 @@ class OK(str, Enum):
DUMMY = 'ok.dummy'
TITLE_AUTHOR_MATCH = 'ok.title_author_match'
PREPRINT_PUBLISHED = 'ok.preprint_published'
+ SLUG_TITLE_AUTHOR_MATCH = 'ok.slug_title_author_match'
class Miss(str, Enum):
@@ -189,12 +210,6 @@ class GroupVerifier:
if not line:
continue
doc = json.loads(line)
- if doc.get("extra", {}).get("container_name", "").lower() in CONTAINER_NAME_BLACKLIST:
- self.counter["skip.container_name_blacklist"] += 1
- continue
- if doc.get("publisher", "").lower() in PUBLISHER_BLACKLIST:
- self.counter["skip.publisher_blacklist"] += 1
- continue
k, vs = get_key_values(doc)
if len(vs) < 2:
self.counter["skip.unique"] += 1
@@ -203,6 +218,18 @@ class GroupVerifier:
self.counter["skip.too_large"] += 1
continue
for a, b in itertools.combinations(vs, r=2):
+ if a.get("extra", {}).get("container_name", "").lower().strip() in CONTAINER_NAME_BLACKLIST:
+ self.counter["skip.container_name_blacklist"] += 1
+ continue
+ if b.get("extra", {}).get("container_name", "").lower().strip() in CONTAINER_NAME_BLACKLIST:
+ self.counter["skip.container_name_blacklist"] += 1
+ continue
+ if a.get("publisher", "").lower().strip() in PUBLISHER_BLACKLIST:
+ self.counter["skip.publisher_blacklist"] += 1
+ continue
+ if b.get("publisher", "").lower().strip() in PUBLISHER_BLACKLIST:
+ self.counter["skip.publisher_blacklist"] += 1
+ continue
result, reason = compare(a, b)
self.counter[reason] += 1
print("https://fatcat.wiki/release/{}".format(a["ident"]),
@@ -233,7 +260,7 @@ def compare(a, b):
a_release_year = a.get("release_year")
b_release_year = b.get("release_year")
- if a.get("title") == b.get("title"):
+ if a.get("title", "").lower() == b.get("title", "").lower():
if a_authors and (a_authors == b_authors):
if a_release_year and b_release_year and a_release_year != b_release_year:
return (Status.DIFFERENT, Miss.YEAR)
@@ -256,6 +283,10 @@ def compare(a, b):
if arxiv_id_a is not None and arxiv_id_b is None or arxiv_id_a is None and arxiv_id_b is not None:
return (Status.STRONG, OK.PREPRINT_PUBLISHED)
+ if a_slug_title and b_slug_title and a_slug_title.strip().replace(" ", "") == b_slug_title.strip().replace(" ", ""):
+ if len(a_slug_authors & b_slug_authors) > 0:
+ return (Status.STRONG, OK.SLUG_TITLE_AUTHOR_MATCH)
+
arxiv_id_a = a.get("ext_ids", {}).get("arxiv")
arxiv_id_b = b.get("ext_ids", {}).get("arxiv")
if arxiv_id_a and arxiv_id_b: