From b600085ecd905fd1f0d053c8acf13462f3916ea7 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Tue, 10 Nov 2020 18:52:19 -0800
Subject: blacklist -> denylist

---
 python/scripts/filter_grobid_metadata.py | 18 +++++++++---------
 python/title_slug_blacklist.txt          |  1 -
 python/title_slug_denylist.txt           |  1 +
 3 files changed, 10 insertions(+), 10 deletions(-)
 delete mode 120000 python/title_slug_blacklist.txt
 create mode 120000 python/title_slug_denylist.txt

diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index c33ab86..dc4bea7 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -3,10 +3,10 @@
 import sys
 import json

-with open('title_slug_blacklist.txt', 'r') as f:
-    TITLE_BLACKLIST = [l.strip() for l in f]
+with open('title_slug_denylist.txt', 'r') as f:
+    TITLE_DENYLIST = [l.strip() for l in f]

-TITLE_BLACKLIST.extend((
+TITLE_DENYLIST.extend((
     'editorial',
     'advertisement',
     'bookreviews',
@@ -19,7 +19,7 @@ TITLE_BLACKLIST.extend((
 ))

 # The full name can't *entirely* be one of these
-NAME_BLACKLIST = (
+NAME_DENYLIST = (
     'phd',
     'phdstudent',
 )
@@ -37,7 +37,7 @@ def tokenize(s, remove_whitespace=True):
     return s.encode('ascii', 'replace').decode('utf8').replace('?', '')

 assert tokenize("Impact Factor: 2.114") == "impactfactor"
-assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST
+assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST

 def filter_title(title):

@@ -45,7 +45,7 @@ def filter_title(title):
     if len(title) > 500:
         return None
     title_slug = tokenize(title, remove_whitespace=True)
-    if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST:
+    if len(title_slug) < 10 or title_slug in TITLE_DENYLIST:
         return None
     if title_slug.startswith('nr'):
         return None
@@ -85,7 +85,7 @@ def filter_title(title):

 def filter_author_name(name):
     name = name['name']
-    if name.strip().lower().replace(' ', '') in NAME_BLACKLIST:
+    if name.strip().lower().replace(' ', '') in NAME_DENYLIST:
         return None
     return ' '.join([t for t in name.split() if tokenize(t)])

@@ -97,12 +97,12 @@ def filter_refs(l):
     return l

 def filter_journal_name(name):
-    # same blacklist, for now
+    # same denylist, for now
     if not name:
         return None
     name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
     slug_name = tokenize(name)
-    if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º":
+    if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º":
         return None
     for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
         if name.startswith(prefix):
diff --git a/python/title_slug_blacklist.txt b/python/title_slug_blacklist.txt
deleted file mode 120000
index 5bca386..0000000
--- a/python/title_slug_blacklist.txt
+++ /dev/null
@@ -1 +0,0 @@
-../scalding/src/main/resources/slug-denylist.txt
\ No newline at end of file
diff --git a/python/title_slug_denylist.txt b/python/title_slug_denylist.txt
new file mode 120000
index 0000000..5bca386
--- /dev/null
+++ b/python/title_slug_denylist.txt
@@ -0,0 +1 @@
+../scalding/src/main/resources/slug-denylist.txt
\ No newline at end of file
--
cgit v1.2.3
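
For readers skimming the patch, the slug-denylist check it renames works roughly as follows. This is a minimal, self-contained sketch: the tokenize body beyond the visible return line, the simplified filter_title wrapper, and the example titles and denylist entries are assumptions for illustration, not code taken verbatim from the repository.

    # Minimal sketch of the slug-denylist pattern (illustrative only).

    def tokenize(s, remove_whitespace=True):
        # Lowercase, keep only letters and whitespace, optionally drop spaces,
        # then force plain ASCII (matches the return line visible in the hunk).
        s = ''.join([c for c in s.lower() if c.isalpha() or c.isspace()])
        if remove_whitespace:
            s = ''.join(s.split())
        return s.encode('ascii', 'replace').decode('utf8').replace('?', '')

    # Hypothetical in-memory denylist; the real script loads
    # title_slug_denylist.txt and extends it with hardcoded slugs.
    TITLE_DENYLIST = ('editorial', 'advertisement', 'impactfactor')

    def filter_title(title):
        # Reject very short slugs and anything on the denylist.
        slug = tokenize(title, remove_whitespace=True)
        if len(slug) < 10 or slug in TITLE_DENYLIST:
            return None
        return title

    assert filter_title("Impact Factor: 2.114") is None
    assert filter_title("A Study of Slug Based Title Filtering") is not None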