1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
|
import io
import itertools
import re
import string
# Characters that survive slugify_string: ASCII digits, ASCII letters and
# whitespace (no punctuation).
printable_no_punct = "".join((string.digits, string.ascii_letters, string.whitespace))

# Rough chemical formula matcher: one or two uppercase letters followed by one
# or two digits, repeated at least once (e.g. the "C3H8" part of "C3H8O").
# More correct: https://www.johndcook.com/blog/2016/02/04/regular-expression-to-match-a-chemical-element/
CHEM_FORMULA = re.compile(r"([A-Z]{1,2}[0-9]{1,2})+")
def slugify_string(s: str) -> str:
    """
    Lowercase `s` and drop every character not in `printable_no_punct`.

    What remains are ASCII digits, ASCII letters and whitespace. Whitespace
    is kept exactly as it appears in the input; runs are not collapsed.
    """
    kept = [char for char in s.lower() if char in printable_no_punct]
    return ''.join(kept)
def cut(f: int = 0, sep: str = '\t', ignore_missing_column: bool = True):
    """
    Build a one-argument function that picks column `f` out of a line.

    The returned function strips the line, splits it on `sep` and returns the
    piece at index `f`. If the line has too few columns, it returns the empty
    string when `ignore_missing_column` is set and raises ValueError otherwise.
    """
    def extract(value):
        columns = value.strip().split(sep)
        # Happy path first: the requested column exists.
        if f < len(columns):
            return columns[f]
        if not ignore_missing_column:
            raise ValueError('cannot split value {} into {} parts'.format(value, f))
        return ""

    return extract
def author_similarity_score(u, v):
    """
    Score how similar two author strings are, as a value between 0 and 1.

    Each string is turned into a set of per-token character pieces (see
    `token_n_grams`); the score is the Jaccard index of the two sets.
    """
    grams_u = set(token_n_grams(u))
    grams_v = set(token_n_grams(v))
    return jaccard(grams_u, grams_v)
def jaccard(a, b):
    """
    Jaccard index of sets a and b: size of the intersection divided by the
    size of the union. Returns 0 when both sets are empty.
    """
    union = a | b
    if len(union) == 0:
        # Avoid dividing by zero for two empty sets.
        return 0
    return len(a & b) / len(union)
def token_n_grams(s, n=2):
    """
    Tokenize `s` and cut every token into pieces of up to `n` characters.

    Pieces come from `nwise`, applied to each token separately, so no piece
    spans two tokens. Returns a flat list of strings in token order.
    """
    grams = []
    for token in tokenize_string(s):
        for piece in nwise(token, n=n):
            grams.append("".join(piece))
    return grams
def tokenize_string(s):
    """
    Lowercase the string and split it on whitespace; returns a list of tokens.
    """
    lowered = s.lower()
    return lowered.split()
def nwise(iterable, n=2):
    """
    Yield consecutive, non-overlapping tuples of up to `n` items taken from
    `iterable`. The last tuple may be shorter if the items run out.
    """
    source = iter(iterable)

    def take():
        return tuple(itertools.islice(source, n))

    # Keep taking chunks until an empty tuple signals exhaustion.
    yield from iter(take, ())
def num_project(s):
    """
    Cf. https://fatcat.wiki/release/6b5yupd7bfcw7gp73hjoavbgfq,
    https://fatcat.wiki/release/7hgzqz3hrngq7omtwdxz4qx34u

    Replace every run of digits in `s` with the placeholder `<NUM>`, so that
    strings differing only in their numbers become identical.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('<NUM>', s)
def contains_chemical_formula(s):
    """
    Return True if any whitespace-separated token of `s` matches
    `CHEM_FORMULA` (e.g. C3H8O or the like in a title), False otherwise.

    Always returns a bool; the previous implementation fell off the end and
    returned None when nothing matched.
    """
    return any(CHEM_FORMULA.search(token) for token in s.split())
|