1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
|
#!/usr/bin/env python3
"""
Filters an input stream of sorted "matchcrossref" scalding job, and outputs
"good enough" matches to be inserted to fatcat.
Currently works on DOI numbers. Filters for a high enough string match (doesn't
re-do title match), and checks author lists. Filters out slugs with too many
matches, and outputs one-line-per-sha1 (aka, file).
No dependencies (only python3 stdlib)
"""
import json
import sys
# Tuning knobs for the match filter.
# Minimum fuzzy-match score (out of 1000) required to keep a candidate row.
score_threshold = 900
# Skip an entire slug group with more candidate lines than this; a slug that
# matches many files usually indicates a bogus/boilerplate title.
max_slug_lines = 10
# NOTE(review): appears unused in this chunk — presumably a minimum author
# count; confirm against the rest of the file before removing.
require_authors = 1
def tokenize(s, remove_whitespace=False):
    """
    Normalize a string for fuzzy matching.

    Lowercases, drops all non-alphanumeric characters (optionally including
    whitespace), and returns a plain-ASCII bytestring with any unencodable
    characters removed entirely.

    Args:
        s: input string (e.g. an author name).
        remove_whitespace: if True, also strip internal whitespace.

    Returns:
        bytes: lowercased alphanumeric (+space) ASCII bytes.
    """
    # Normalize typographic apostrophes (U+2019) to ASCII before filtering.
    # BUGFIX: the original called s.replace(...) without assigning the result;
    # str.replace returns a new string (strings are immutable).
    s = s.replace("\u2019", "'")
    # Remove non-alphanumeric characters (whitespace is kept for now)
    s = ''.join([c for c in s.lower() if c.isalnum() or c.isspace()])
    if remove_whitespace:
        s = ''.join(s.split())
    # Encode as dumb ASCII, dropping anything unencodable (TODO: this is horrible)
    return s.encode('ascii', 'replace').replace(b'?', b'')
def check_authors(left, right):
    """
    Intended to check GROBID extracted authors (right) against "known good"
    (but maybe not perfect) Crossref metadata authors ("left").

    Accepts only when every Crossref author's surname token appears as a
    substring of the combined, tokenized GROBID author string; rejects
    empty/oversized author lists and unusably short surnames.
    """
    if not left:
        return False
    if len(left) > len(right):
        return False
    # Single haystack built from every GROBID author name
    haystack = tokenize(" ".join(right))
    for name in left:
        parts = name.lower().replace('jr.', '').split()
        if not parts:
            return False
        surname = tokenize(parts[-1])
        if len(surname) <= 1:
            # weird author name (single char)
            return False
        if surname not in haystack:
            #print("MISSING: {} from {}".format(surname.decode('utf8'), haystack.decode('utf8')))
            return False
    return True
def test_check_authors():
    """Spot-checks for check_authors(), grouped by expected verdict."""
    # Expected rejections: empty Crossref author list
    for left, right in (
        ([], []),
        ([], ['one']),
    ):
        assert not check_authors(left, right)
    # Expected acceptances: surname substring matching is deliberately loose
    for left, right in (
        (['one'], ['one']),
        (['one two'], ['One Two']),
        (['two'], ['One Two']),
        (['two'], ['two, one']),
        (['mago'], ['Mr. Magoo']),
        (['Mr. Magoo'], ['Mr Magoo']),
        (['one', 'tw', 'thr'], ['one', 'two', 'three']),
    ):
        assert check_authors(left, right)
# Rows are (score, grobid, crossref)
def process_group(rows):
    """
    Filter one slug group of candidate matches and print the keepers.

    Skips the whole group when it is suspiciously large, drops rows below
    the score threshold or with mismatched author lists, then emits one
    "sha1<TAB>[doi, ...]" line per file to stdout.
    """
    if len(rows) > max_slug_lines:
        return
    keepers = dict()
    for score_str, grobid_json, crossref_json in rows:
        if int(score_str) < score_threshold:
            continue
        grobid = json.loads(grobid_json)
        crossref = json.loads(crossref_json)
        if not check_authors(crossref['authors'], grobid['authors']):
            #print("NO (crossref/grobid): {} {}".format(crossref['authors'], grobid['authors']))
            continue
        doi = crossref['doi'].lower()
        # Accumulate all accepted DOIs per file (sha1)
        keepers.setdefault(grobid['sha1'], []).append(doi)
    for sha1, doi_list in keepers.items():
        print("{}\t{}".format(sha1, json.dumps(doi_list)))
def run():
    """
    Stream TSV rows (slug, score, grobid, crossref) from stdin, batch
    consecutive rows sharing a slug, and hand each batch to process_group().
    Input must be pre-sorted by slug for the grouping to be correct.
    """
    current_slug = None
    batch = []
    for raw in sys.stdin:
        fields = raw.strip().split('\t')
        assert len(fields) == 4
        slug = fields[0]
        # Slug changed: flush the accumulated batch before starting a new one
        if batch and current_slug and slug != current_slug:
            process_group(batch)
            batch = []
        current_slug = slug
        batch.append(fields[1:])
    # catch any remaining
    if batch:
        process_group(batch)


if __name__ == '__main__':
    run()
|