diff options
-rw-r--r-- | fuzzycat/cluster.py | 53 | ||||
-rw-r--r-- | fuzzycat/main.py | 22 |
2 files changed, 52 insertions, 23 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index fbc7712..3282f3b 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -14,18 +14,13 @@ __all__ = [ "release_key_title_normalized", "release_key_title_nysiis", "sort_file_by_column", + "group_by", ] get_ident_title = operator.itemgetter("ident", "title") -ws_replacer = str.maketrans("\t", " ", "\n", " ") +ws_replacer = str.maketrans({"\t": " ", "\n": " "}) non_word_re = re.compile('[\W_]+', re.UNICODE) -def cut(value, f=0, sep='\t'): - """ - Split value by separator and return a single column. - """ - return value.split(sep)[f] - def release_key_title(re): id, title = get_ident_title(re) if not title: @@ -41,13 +36,15 @@ def release_key_title_nysiis(re): id, title = release_key_title(re) return (id, fuzzy.nysiis(title)) -def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-"): +def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-", tmpdir=None): """ Sort tabular file with sort(1), returns the filename of the sorted file. TODO: use separate /fast/tmp for sort. """ with tempfile.NamedTemporaryFile(delete=False, mode=mode, prefix=prefix) as tf: env = os.environ.copy() + if tmpdir is not None: + env["TMPDIR"] = tmpdir if fast: env["LC_ALL"] = "C" subprocess.run(["sort"] + opts.split() + [filename], stdout=tf, env=env) @@ -55,6 +52,10 @@ def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat- return tf.name def group_by(filename, key=None, value=None, comment=""): + """ + Iterate over lines in filename, group by key (a callable deriving the key + from the line), then apply value callable to emit a minimal document. + """ with open(filename) as f: for k, g in itertools.groupby(f, key=key): doc = { @@ -64,29 +65,45 @@ def group_by(filename, key=None, value=None, comment=""): } yield doc +def cut(f=0, sep='\t'): + """ + Return a callable, that extracts a given column from a file with a specific + separator. TODO: move this into more generic place. + """ + def f(value): + parts = value.split(sep) + if len(parts) > f + 1: + raise ValueError('cannot split value into {} parts'.format(f)) + return parts[f] + return f + class Cluster: """ Cluster scaffold for release entities. """ - def __init__(self, files=None, output=None, keyfunc=lambda v: v, tmp_prefix='fuzzycat-'): + def __init__(self, files="-", output=sys.stdout, keyfunc=lambda v: v, prefix='fuzzycat-', tmpdir=None): + """ + Files can be a list of files or "-" for stdin. + """ self.files = files - self.tmp_prefix = tmp_prefix self.keyfunc = keyfunc self.output = output - if self.output is None: - self.output = sys.stdout + self.prefix = prefix + self.tmpdir = tmpdir def run(self): - with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.tmp_prefix) as tf: + """ + Run clustering and write output to given stream or file. + """ + keyfunc = self.keyfunc # Save a lookup in loop. + with tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=self.prefix) as tf: for line in fileinput.input(files=files): try: - id, key = self.keyfunc(json.loads(line)) + id, key = keyfunc(json.loads(line)) + print("{}\t{}".format(id, key), file=tf) except (KeyError, ValueError): continue - else: - print("{}\t{}".format(id, key), file=tf) - - sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.tmp_prefix) + sbc = sort_by_column(tf.name, opts='-k 2', prefix=self.prefix, tmpdir=self.tmpdir) for doc in group_by(sbc, key=cut(f=1), value=cut(f=0), comment=keyfunc.__name__): json.dump(doc, self.output) diff --git a/fuzzycat/main.py b/fuzzycat/main.py index 8c566d2..71f6f17 100644 --- a/fuzzycat/main.py +++ b/fuzzycat/main.py @@ -1,10 +1,22 @@ import argparse -import elasticsearch -import tempfile import sys +import tempfile + +import elasticsearch + +from fuzzycat.cluster import (Cluster, release_key_title, + release_key_title_normalized, + release_key_title_nysiis) + def run_cluster(args): - print('cluster') + types = { + 'title': release_key_title, + 'tnorm': release_key_title_normalized, + 'tnysi': release_key_title_nysiis, + } + cluster = Cluster(files=args.files, keyfunc=types.get(args.type), tmpdir=args.tmpdir, prefix=args.prefix) + def run_verify(args): print('verify') @@ -14,12 +26,13 @@ if __name__ == '__main__': usage='%(prog)s command [options]', formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--tmp-prefix', default='fuzzycat-', help='temp file prefix') + parser.add_argument('--prefix', default='fuzzycat-', help='temp file prefix') parser.add_argument('--tmpdir', default=tempfile.gettempdir(), help='temporary directory') subparsers = parser.add_subparsers() sub_cluster = subparsers.add_parser('cluster', help='group entities') sub_cluster.set_defaults(func=run_cluster) + sub_cluster.add_argument('-f', '--files', default="-", help='output files') sub_cluster.add_argument('-t', '--type', default='title', help='cluster algorithm') sub_verify = subparsers.add_parser('verify', help='verify groups') @@ -31,4 +44,3 @@ if __name__ == '__main__': sys.exit(1) args.func(args) - |