about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-11-10 18:52:19 -0800
committer Bryan Newbold <bnewbold@archive.org> 2020-11-10 18:52:19 -0800
commit b600085ecd905fd1f0d053c8acf13462f3916ea7 (patch)
tree 8486cf91ab833048f968cfdca326ce991ea1b6e0 /python
parent 1dfc5a0d6b58f6fae02237f1430f5666bbfc105f (diff)
download sandcrawler-b600085ecd905fd1f0d053c8acf13462f3916ea7.tar.gz
sandcrawler-b600085ecd905fd1f0d053c8acf13462f3916ea7.zip
blacklist -> denylist
Diffstat (limited to 'python')
-rwxr-xr-x python/scripts/filter_grobid_metadata.py 18
l--------- python/title_slug_denylist.txt (renamed from python/title_slug_blacklist.txt) 0
2 files changed, 9 insertions, 9 deletions
diff --git a/python/scripts/filter_grobid_metadata.py b/python/scripts/filter_grobid_metadata.py
index c33ab86..dc4bea7 100755
--- a/python/scripts/filter_grobid_metadata.py
+++ b/python/scripts/filter_grobid_metadata.py
@@ -3,10 +3,10 @@
import sys
import json
-with open('title_slug_blacklist.txt', 'r') as f:
- TITLE_BLACKLIST = [l.strip() for l in f]
+with open('title_slug_denylist.txt', 'r') as f:
+ TITLE_DENYLIST = [l.strip() for l in f]
-TITLE_BLACKLIST.extend((
+TITLE_DENYLIST.extend((
'editorial',
'advertisement',
'bookreviews',
@@ -19,7 +19,7 @@ TITLE_BLACKLIST.extend((
))
# The full name can't *entirely* be one of these
-NAME_BLACKLIST = (
+NAME_DENYLIST = (
'phd',
'phdstudent',
)
@@ -37,7 +37,7 @@ def tokenize(s, remove_whitespace=True):
return s.encode('ascii', 'replace').decode('utf8').replace('?', '')
assert tokenize("Impact Factor: 2.114") == "impactfactor"
-assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST
+assert tokenize("Impact Factor: 2.114") in TITLE_DENYLIST
def filter_title(title):
@@ -45,7 +45,7 @@ def filter_title(title):
if len(title) > 500:
return None
title_slug = tokenize(title, remove_whitespace=True)
- if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST:
+ if len(title_slug) < 10 or title_slug in TITLE_DENYLIST:
return None
if title_slug.startswith('nr'):
return None
@@ -85,7 +85,7 @@ def filter_title(title):
def filter_author_name(name):
name = name['name']
- if name.strip().lower().replace(' ', '') in NAME_BLACKLIST:
+ if name.strip().lower().replace(' ', '') in NAME_DENYLIST:
return None
return ' '.join([t for t in name.split() if tokenize(t)])
@@ -97,12 +97,12 @@ def filter_refs(l):
return l
def filter_journal_name(name):
- # same blacklist, for now
+ # same denylist, for now
if not name:
return None
name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
slug_name = tokenize(name)
- if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º":
+ if slug_name in TITLE_DENYLIST or len(slug_name) < 4 or name == "N.º":
return None
for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ", "Original Article ", "Research Article ", "Available online www.jocpr.com "):
if name.startswith(prefix):
diff --git a/python/title_slug_blacklist.txt b/python/title_slug_denylist.txt
index 5bca386..5bca386 120000
--- a/python/title_slug_blacklist.txt
+++ b/python/title_slug_denylist.txt