1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
"""
Clustering stage.
"""
import fileinput
import functools
import itertools
import json
import operator
import os
import re
import subprocess
import sys
import tempfile

import fuzzy
# Public API of this module, i.e. the names exported by a star import.
# Every entry has to name an object that is actually defined in this file;
# a stale entry makes ``from <module> import *`` fail with AttributeError.
__all__ = [
    "release_key_title",
    "release_key_title_normalized",
    "release_key_title_nysiis",
    "sort_by_column",
]
# Pull the ("ident", "title") pair out of a release document (a mapping).
# Raises KeyError if either key is absent.
get_ident_title = operator.itemgetter("ident", "title")

# Translation table turning tab and newline into a single space each. Titles
# are later written into a tab-separated, line-oriented file, so neither
# character may survive inside a title.
ws_replacer = str.maketrans({"\t": " ", "\n": " "})

# Matches runs of non-word characters and underscores; used to strip
# everything but letters and digits from a title.
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
def cut(value, f=0, sep='\t'):
    """
    Return the column at zero-based index `f` of `value`, where columns are
    delimited by `sep`. The selected column is returned unmodified (a
    trailing newline on the last column is kept). An out-of-range `f`
    raises IndexError.
    """
    columns = value.split(sep)
    return columns[f]
def release_key_title(re):
    """
    Return an (ident, title) pair for the release document `re`, with tabs
    and newlines in the title replaced by spaces and surrounding whitespace
    stripped.

    Raises ValueError if the title is empty or otherwise falsy; a KeyError
    from the lookup propagates if "ident" or "title" is missing.
    """
    # NOTE: the parameter name shadows the `re` module inside this function.
    ident, raw_title = get_ident_title(re)
    if raw_title:
        return (ident, raw_title.translate(ws_replacer).strip())
    raise ValueError('title missing')
def release_key_title_normalized(re):
    """
    Return an (ident, key) pair where key is the cleaned title of the release
    document `re` with every run of non-word characters and underscores
    removed. Errors from release_key_title propagate unchanged.
    """
    ident, title = release_key_title(re)
    normalized = non_word_re.sub('', title)
    return (ident, normalized)
def release_key_title_nysiis(re):
    """
    Return an (ident, key) pair where key is the result of `fuzzy.nysiis`
    applied to the cleaned title of the release document `re` (presumably
    the NYSIIS phonetic code of the title). Errors from release_key_title
    propagate unchanged.
    """
    ident, title = release_key_title(re)
    phonetic = fuzzy.nysiis(title)
    return (ident, phonetic)
def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-"):
    """
    Sort tabular file with sort(1), returns the filename of the sorted file.

    The sorted output is written to a new named temporary file which is NOT
    deleted automatically; the caller owns it and has to remove it.

    Parameters:
        filename: path of the file to sort.
        opts: whitespace-separated options handed to sort(1).
        fast: if true, run sort with LC_ALL=C in its environment.
        mode: mode used to open the temporary output file.
        prefix: filename prefix of the temporary output file.

    Raises:
        subprocess.CalledProcessError: if sort(1) exits with a non-zero
            status. The temporary file is removed in that case, so callers
            never receive a path to partial output.

    TODO: use separate /fast/tmp for sort.
    """
    env = os.environ.copy()
    if fast:
        env["LC_ALL"] = "C"
    command = ["sort"] + opts.split() + [filename]

    tf = tempfile.NamedTemporaryFile(delete=False, mode=mode, prefix=prefix)
    try:
        with tf:
            # check=True: a failed sort must not pass silently, otherwise an
            # empty or truncated file would be treated as the sorted result.
            subprocess.run(command, stdout=tf, env=env, check=True)
    except BaseException:
        # Do not leave the (useless) output file behind on any failure.
        os.remove(tf.name)
        raise
    return tf.name
def group_by(filename, key=None, value=None, comment=""):
    """
    Iterate over the lines of `filename` and yield one document per run of
    consecutive lines that share the same key.

    Only adjacent lines are grouped (itertools.groupby semantics), so the
    file has to be sorted by the key beforehand for each key to appear once.

    Parameters:
        key: callable mapping a line to its grouping key (a string); if
            None, the line itself is the key.
        value: callable mapping a line to the value collected for it; if
            None, the raw line is collected unchanged.
        comment: copied verbatim into every yielded document.

    Yields:
        dict with "k" (the key, stripped of surrounding whitespace), "v"
        (list of values of the group) and "c" (the comment).
    """
    # A missing value function means "keep the line as is" rather than
    # failing on a call to None.
    extract = value if value is not None else (lambda line: line)
    with open(filename) as f:
        for k, lines in itertools.groupby(f, key=key):
            yield {
                "k": k.strip(),
                "v": [extract(line) for line in lines],
                "c": comment,
            }
class Cluster:
    """
    Cluster scaffold for release entities.

    Reads JSON documents (one per line) from the input files, derives an
    (id, key) pair for each one with `keyfunc`, and writes one JSON document
    per group of ids sharing the same key to `output`.
    """
    def __init__(self, files=None, output=None, keyfunc=lambda v: v, tmp_prefix='fuzzycat-'):
        """
        Parameters:
            files: passed to fileinput.input() as `files`.
            output: writable text stream for the results; defaults to
                sys.stdout.
            keyfunc: callable taking a decoded JSON document and returning
                an (id, key) pair; documents for which it raises KeyError
                or ValueError are skipped.
            tmp_prefix: filename prefix for the temporary files.
        """
        self.files = files
        self.tmp_prefix = tmp_prefix
        self.keyfunc = keyfunc
        self.output = sys.stdout if output is None else output

    def run(self):
        """
        Run the clustering and write one JSON document per line to
        `self.output`. Each document has the shape produced by group_by:
        "k" is the key, "v" the list of ids, "c" the name of the key
        function. Temporary files are removed even if a step fails.
        """
        tmp_files = []
        try:
            # Stage 1: write "id<TAB>key" lines for every usable document.
            with tempfile.NamedTemporaryFile(delete=False, mode="w",
                                             prefix=self.tmp_prefix) as tf:
                tmp_files.append(tf.name)
                with fileinput.input(files=self.files) as lines:
                    for line in lines:
                        try:
                            ident, key = self.keyfunc(json.loads(line))
                        except (KeyError, ValueError):
                            # Undecodable line or document without a usable key.
                            continue
                        else:
                            print("{}\t{}".format(ident, key), file=tf)

            # Stage 2: sort by key. The file above is closed at this point,
            # so everything written is on disk before sort(1) reads it.
            sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.tmp_prefix)
            tmp_files.append(sbc)

            # Stage 3: group adjacent lines with equal key. Column 1 holds
            # the key, column 0 the id.
            # Callables such as functools.partial objects have no __name__.
            comment = getattr(self.keyfunc, "__name__", repr(self.keyfunc))
            groups = group_by(sbc,
                              key=functools.partial(cut, f=1),
                              value=functools.partial(cut, f=0),
                              comment=comment)
            for doc in groups:
                json.dump(doc, self.output)
                # Newline-delimit the documents so the output stays parseable.
                self.output.write("\n")
        finally:
            for name in tmp_files:
                os.remove(name)
|