fuzzycat/cluster.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111

"""
Clustering stage.
"""

import functools
import operator
import re
import sys

import fuzzy

__all__ = [
    "release_key_title",
    "release_key_title_normalized",
    "release_key_title_nysiis",
    "sort_file_by_column",
    "group_by",
]

get_ident_title = operator.itemgetter("ident", "title")
ws_replacer = str.maketrans({"\t": " ", "\n": " "})
non_word_re = re.compile('[\W_]+', re.UNICODE)

def release_key_title(re):
    id, title = get_ident_title(re)
    if not title:
        raise ValueError('title missing')
    title = title.translate(ws_replacer).strip()
    return (id, title)

def release_key_title_normalized(re):
    id, title = release_key_title(re)
    return (id, non_word_re.sub('', title))

def release_key_title_nysiis(re):
    id, title = release_key_title(re)
    return (id, fuzzy.nysiis(title))

def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-", tmpdir=None):
    """
    Sort tabular file with sort(1), returns the filename of the sorted file.
    TODO: use separate /fast/tmp for sort.
    """
    with tempfile.NamedTemporaryFile(delete=False, mode=mode, prefix=prefix) as tf:
        env = os.environ.copy()
        if tmpdir is not None:
            env["TMPDIR"] = tmpdir
        if fast:
            env["LC_ALL"] = "C"
        subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, env=env)

    return tf.name

def group_by(filename, key=None, value=None, comment=""):
    """
    Iterate over lines in filename, group by key (a callable deriving the key
    from the line), then apply value callable to emit a minimal document.
    """
    with open(filename) as f:
        for k, g in itertools.groupby(f, key=key):
            doc = {
                "k": k.strip(),
                "v": [value(v) for v in g],
                "c": comment,
            }
            yield doc

def cut(f=0, sep='\t'):
    """
    Return a callable, that extracts a given column from a file with a specific
    separator. TODO: move this into more generic place.
    """
    def f(value):
        parts = value.split(sep)
        if len(parts) > f + 1:
            raise ValueError('cannot split value into {} parts'.format(f))
        return parts[f]
    return f

class Cluster:
    """
    Cluster scaffold for release entities.
    """
    def __init__(self, files="-", output=sys.stdout, keyfunc=lambda v: v, prefix='fuzzycat-', tmpdir=None):
        """
        Files can be a list of files or "-" for stdin.
        """
        self.files = files
        self.keyfunc = keyfunc
        self.output = output
        self.prefix = prefix
        self.tmpdir = tmpdir

    def run(self):
        """
        Run clustering and write output to given stream or file.
        """
        keyfunc = self.keyfunc # Save a lookup in loop.
        with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf:
            for line in fileinput.input(files=files):
                try:
                    id, key = keyfunc(json.loads(line))
                    print("{}\t{}".format(id, key), file=tf)
                except (KeyError, ValueError):
                    continue
        sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.prefix, tmpdir=self.tmpdir)
        for doc in group_by(sbc, key=cut(f=1), value=cut(f=0), comment=keyfunc.__name__):
            json.dump(doc, self.output)

        os.remove(sbc)
        os.remove(tf.name)