diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-10 18:52:19 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-10 18:52:19 -0800 |
commit | b600085ecd905fd1f0d053c8acf13462f3916ea7 (patch) | |
tree | 8486cf91ab833048f968cfdca326ce991ea1b6e0 /python | |
parent | 1dfc5a0d6b58f6fae02237f1430f5666bbfc105f (diff) | |
download | sandcrawler-b600085ecd905fd1f0d053c8acf13462f3916ea7.tar.gz sandcrawler-b600085ecd905fd1f0d053c8acf13462f3916ea7.zip |
blacklist -> denylist
Diffstat (limited to 'python')
-rwxr-xr-x | python/scripts/filter_grobid_metadata.py | 18 | ||||
l--------- | python/title_slug_denylist.txt (renamed from python/title_slug_blacklist.txt) | 0 |
2 files changed, 9 insertions, 9 deletions
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index c33ab86..dc4bea7 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -3,10 +3,10 @@
 import sys
 import json
 
-with open('title_slug_blacklist.txt', 'r') as f:
-    TITLE_BLACKLIST = [l.strip() for l in f]
+with open('title_slug_denylist.txt', 'r') as f:
+    TITLE_DENYLIST = [l.strip() for l in f]
 
-TITLE_BLACKLIST.extend((
+TITLE_DENYLIST.extend((
     'editorial',
     'advertisement',
     'bookreviews',
@@ -19,7 +19,7 @@ TITLE_BLACKLIST.extend((
 ))
 
 # The full name can't *entirely* be one of these
-NAME_BLACKLIST = (
+NAME_DENYLIST = (
     'phd',
     'phdstudent',
 )
@@ -37,7 +37,7 @@ def tokenize(s, remove_whitespace=True):
     return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
 
 assert tokenize("Impact Factor: 2.114") == "impactfactor"
-assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST
+assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST
 
 def filter_title(title):
 
@@ -45,7 +45,7 @@ def filter_title(title):
     if len(title) > 500:
         return None
     title_slug = tokenize(title, remove_whitespace=True)
-    if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST:
+    if len(title_slug) < 10 or title_slug in TITLE_DENYLIST:
         return None
     if title_slug.startswith('nr'):
         return None
@@ -85,7 +85,7 @@ def filter_title(title):
 
 def filter_author_name(name):
     name = name['name']
-    if name.strip().lower().replace(' ', '') in NAME_BLACKLIST:
+    if name.strip().lower().replace(' ', '') in NAME_DENYLIST:
         return None
     return ' '.join([t for t in name.split() if tokenize(t)])
 
@@ -97,12 +97,12 @@ def filter_refs(l):
     return l
 
 def filter_journal_name(name):
-    # same blacklist, for now
+    # same denylist, for now
     if not name:
         return None
     name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
     slug_name = tokenize(name)
-    if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º":
+    if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º":
         return None
     for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
         if name.startswith(prefix):
diff --git a/python/title_slug_blacklist.txt b/python/title_slug_denylist.txt
index 5bca386..5bca386 120000
--- a/python/title_slug_blacklist.txt
+++ b/python/title_slug_denylist.txt