1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
|
#!/usr/bin/env python3
import sys
import json
# Load the title-slug blacklist (one pre-tokenized slug per line) from a file
# shipped alongside this script, then extend it with additional known-bad
# slugs that show up in extracted PDF metadata.
with open('title_slug_blacklist.txt', 'r') as f:
    TITLE_BLACKLIST = [l.strip() for l in f]

TITLE_BLACKLIST.extend((
    'editorial',
    'advertisement',
    'bookreviews',
    'reviews',
    'nr',
    'abstractoriginalarticle',
    'originalarticle',
    'impactfactor',
    'articlenumber',
))

# The full name can't *entirely* be one of these
NAME_BLACKLIST = (
    'phd',
    'phdstudent',
)
def tokenize(s, remove_whitespace=True):
    """Reduce a string to a lowercase, ASCII-only alphabetic slug.

    Lowercases, drops every character that is not alphabetic or whitespace,
    optionally removes all whitespace, then strips any remaining non-ASCII
    characters. Used to compare titles/names against the blacklists.
    """
    # Normalize typographic apostrophes to ASCII. NOTE: the original call
    # discarded the result of str.replace() (strings are immutable), so it
    # was a no-op; the result must be re-assigned.
    s = s.replace('\u2019', "'")
    # Keep only alphabetic and whitespace characters, lowercased
    s = ''.join(c for c in s.lower() if c.isalpha() or c.isspace())
    if remove_whitespace:
        s = ''.join(s.split())
    # Drop remaining non-ASCII characters: encode with '?' placeholders, then
    # strip the placeholders. (TODO: this is lossy; transliteration would be
    # better.) Decoding as 'ascii' is correct here — the bytes are pure ASCII.
    return s.encode('ascii', 'replace').decode('ascii').replace('?', '')
# Module-load sanity checks: tokenize() must slug a known masthead string,
# and that slug must be present in the loaded blacklist.
assert tokenize("Impact Factor: 2.114") == "impactfactor"
assert tokenize("Impact Factor: 2.114") in TITLE_BLACKLIST
def filter_title(title):
    """Clean a title string; return the cleaned title, or None to reject it.

    Rejects over-long titles, blacklisted or too-short slugs, obvious journal
    mastheads (volume/issue, ISSN, "Downloaded from"), OCR artifacts with
    spaces between letters, and over-punctuated strings.
    """
    title = title.strip()
    if len(title) > 500:
        return None
    title_slug = tokenize(title, remove_whitespace=True)
    if len(title_slug) < 10 or title_slug in TITLE_BLACKLIST:
        return None
    if title_slug.startswith('nr'):
        return None
    if title.lower().replace('.', '').startswith('int j '):
        return None
    for prefix in ("Title: ", "Original Article: ", "Article: ", "Original Article "):
        if title.startswith(prefix):
            # Strip the boilerplate prefix. NOTE: the original called
            # title.replace() without assigning the result (strings are
            # immutable), so nothing was ever removed; slicing removes
            # exactly the leading prefix.
            title = title[len(prefix):]
    if title.startswith("The Journal of "):
        return None
    if "volume" in title_slug and "issue" in title_slug:
        return None
    if "downloadedfrom" in title_slug:
        return None
    if title_slug.startswith("issn"):
        return None
    # titles with too many or too few words
    title_words = len(title.split())
    if title_words > 50 or title_words < 2:
        return None
    # titles with spaces between every letter (more than N single-char words)
    if len([True for w in title.split() if len(w) == 1]) > 12:
        return None
    # too deep subtitling/splitting
    if title.count(':') > 3 or title.count('|') > 1 or title.count('.') > 1:
        return None
    return title
def filter_author_name(name):
    """Extract and clean the 'name' field of an author dict.

    Returns None when the whole name is blacklisted (e.g. just "PhD");
    otherwise returns the name with non-tokenizable words dropped.
    """
    full = name['name']
    squashed = full.strip().lower().replace(' ', '')
    if squashed in NAME_BLACKLIST:
        return None
    kept = [word for word in full.split() if tokenize(word)]
    return ' '.join(kept)
def filter_authors(l):
    """Filter a list of author dicts, dropping empty and one-character names."""
    cleaned = map(filter_author_name, l)
    return [{'name': name} for name in cleaned if name and len(name) > 1]
def filter_refs(l):
    """Pass the citation list through unchanged (real filtering is TODO)."""
    return l
def filter_journal_name(name):
    """Clean a journal name; return the cleaned name, or None to reject it.

    Reuses the title blacklist for now; also strips common boilerplate
    prefixes/suffixes and rejects separator-line garbage and over-long names.
    """
    if not name:
        return None
    name = name.replace(' e-ISSN', '').replace(' p-ISSN', '')
    slug_name = tokenize(name)
    if slug_name in TITLE_BLACKLIST or len(slug_name) < 4 or name == "N.º":
        return None
    for prefix in ("/ ", "~ ", "& ", "© ", "Original Research Article ",
                   "Original Article ", "Research Article ",
                   "Available online www.jocpr.com "):
        if name.startswith(prefix):
            # Remove only the leading prefix. The original used str.replace(),
            # which would also delete any interior occurrences of the same
            # text, mangling legitimate names.
            name = name[len(prefix):]
    for suffix in (" Available online at www.sciarena.com", " Original Article",
                   " Available online at", " ISSN", " ISSUE"):
        if name.endswith(suffix):
            # Remove only the trailing suffix (same str.replace() hazard).
            name = name[:-len(suffix)]
    if "====================" in name:
        return None
    if len(name) > 150:
        return None
    # Collapse internal whitespace runs to single spaces
    return ' '.join(name.split())
def filter_metadata(obj):
    """Clean one metadata record in place; return it, or None to drop it.

    Requires both a title and an authors list; runs each field through its
    dedicated filter.
    """
    if not (obj.get('title') and obj.get('authors')):
        return None
    cleaned_title = filter_title(obj['title'])
    if not cleaned_title:
        #sys.stderr.write("bad title\n")
        return None
    obj['title'] = cleaned_title
    obj['authors'] = filter_authors(obj['authors'])
    obj['citations'] = filter_refs(obj['citations'])
    obj['journal']['name'] = filter_journal_name(obj['journal']['name'])
    return obj
def run(invert=False):
    """Filter metadata records from stdin to stdout, one per line.

    Accepts either 5-column TSV (JSON in the last column) or bare JSON lines.
    By default prints the cleaned records; with invert=True, prints only the
    raw JSON of records that were rejected.
    """
    for line in sys.stdin:
        fields = line.split('\t')
        if len(fields) == 5:
            raw = fields[4]
        elif len(fields) == 1:
            raw = fields[0]
        else:
            sys.stderr.write("bad line\n")
            continue
        processed = filter_metadata(json.loads(raw))
        if processed:
            if not invert:
                serialized = json.dumps(processed)
                if len(fields) == 5:
                    fields[4] = serialized
                else:
                    fields[0] = serialized
                print('\t'.join(fields))
        elif invert:
            print(raw.strip())


if __name__=="__main__":
    run(invert="--invert" in sys.argv)
|