3 files changed, 125 insertions, 23 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index a40db06..b2f739f 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -6,13 +6,13 @@ Clustering stage.
 import fileinput
 import itertools
 import json
+import logging
 import operator
 import os
 import re
 import subprocess
 import sys
 import tempfile
-import logging
 
 import fuzzy
 
@@ -29,6 +29,7 @@ get_ident_title = operator.itemgetter("ident", "title")
 ws_replacer = str.maketrans({"\t": " ", "\n": " "})
 non_word_re = re.compile(r'[\W_]+', re.UNICODE)
 
+
 def release_key_title(release_entity):
     id, title = get_ident_title(release_entity)
     if not title:
@@ -48,6 +49,7 @@ def release_key_title_nysiis(release_entity):
     id, title = release_key_title(release_entity)
     return (id, fuzzy.nysiis(title))
 
+
 def release_key_title_authors_ngram(release_entity):
     """
     Derive a key from title and authors.
@@ -55,7 +57,6 @@ def release_key_title_authors_ngram(release_entity):
     # SS: compare ngram sets?
 
 
-
 def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-", tmpdir=None):
     """
     Sort tabular file with sort(1), returns the filename of the sorted file.
@@ -81,8 +82,9 @@ def group_by(seq, key=None, value=None, comment=""):
         doc = {
             "k": k.strip(),
             "v": [value(v) for v in g],
-            "c": comment,
         }
+        if comment:
+            doc["c"] = comment
         yield doc
 
 
diff --git a/fuzzycat/main.py b/fuzzycat/main.py
index 44e0857..5f9efc3 100644
--- a/fuzzycat/main.py
+++ b/fuzzycat/main.py
@@ -11,9 +11,9 @@ Run, e.g. fuzzycat cluster --help for more options. Example:
 """
 
 import argparse
+import logging
 import sys
 import tempfile
-import logging
 
 from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,
                               release_key_title_nysiis)
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index aa015a6..7439e15 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -1,9 +1,12 @@
 import pytest
 import collections
-from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis
+import tempfile
+from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis, sort_by_column, group_by
+import os
 
 Case = collections.namedtuple("Case", 'input output')
 
+
 def test_release_key_title():
     with pytest.raises(KeyError):
         release_key_title({})
@@ -14,9 +17,18 @@ def test_release_key_title():
     with pytest.raises(ValueError, match='title.*missing'):
         release_key_title({'ident': '', 'title': ''})
     cases = (
-        Case(input={'ident': '', 'title': 'simhash'}, output=('', 'simhash')),
-        Case(input={'ident': '', 'title': 'Simhash'}, output=('', 'Simhash')),
-        Case(input={'ident': '', 'title': 'Sim  hash'}, output=('', 'Sim  hash')),
+        Case(input={
+            'ident': '',
+            'title': 'simhash'
+        }, output=('', 'simhash')),
+        Case(input={
+            'ident': '',
+            'title': 'Simhash'
+        }, output=('', 'Simhash')),
+        Case(input={
+            'ident': '',
+            'title': 'Sim  hash'
+        }, output=('', 'Sim  hash')),
     )
     for case in cases:
         assert case.output == release_key_title(case.input)
@@ -24,25 +36,113 @@ def test_release_key_title():
 
 def test_release_key_title_normalized():
     cases = (
-        Case(input={'ident': '', 'title': 'simhash'}, output=('', 'simhash')),
-        Case(input={'ident': '', 'title': 'Simhash'}, output=('', 'simhash')),
-        Case(input={'ident': '', 'title': 'Sim  hash'}, output=('', 'simhash')),
-        Case(input={'ident': '', 'title': 'THE year 1929'}, output=('', 'theyear1929')),
-        Case(input={'ident': '', 'title': '2019?'}, output=('', '2019')),
-        Case(input={'ident': '123', 'title': 'H~~2019?'}, output=('123', 'h2019')),
+        Case(input={
+            'ident': '',
+            'title': 'simhash'
+        }, output=('', 'simhash')),
+        Case(input={
+            'ident': '',
+            'title': 'Simhash'
+        }, output=('', 'simhash')),  # expected: upper case folded to lower case
+        Case(input={
+            'ident': '',
+            'title': 'Sim  hash'
+        }, output=('', 'simhash')),  # expected: inner whitespace removed
+        Case(input={
+            'ident': '',
+            'title': 'THE year 1929'
+        }, output=('', 'theyear1929')),
+        Case(input={
+            'ident': '',
+            'title': '2019?'
+        }, output=('', '2019')),  # expected: punctuation dropped, digits kept
+        Case(input={
+            'ident': '123',
+            'title': 'H~~2019?'
+        }, output=('123', 'h2019')),  # expected: ident is passed through unchanged
     )
     for case in cases:
-        assert case.output == release_key_title_normalized(case.input), 'failed case {}'.format(case.input)
+        assert case.output == release_key_title_normalized(case.input), 'failed case {}'.format(
+            case.input)
+
 
 def test_release_key_title_nysiis():
     cases = (
-        Case(input={'ident': '', 'title': 'simhash'}, output=('', 'SANAS')),
-        Case(input={'ident': '', 'title': 'Simhash'}, output=('', 'SANAS')),
-        Case(input={'ident': '', 'title': 'Sim  hash'}, output=('', 'SANAS')),
-        Case(input={'ident': '', 'title': 'THE year 1929'}, output=('', 'TAR')),
-        Case(input={'ident': '', 'title': '2019?'}, output=('', '')),
-        Case(input={'ident': '123', 'title': 'H~~2019?'}, output=('123', 'H')),
-        Case(input={'ident': '123', 'title': '世界'}, output=('123', '')),
+        Case(input={
+            'ident': '',
+            'title': 'simhash'
+        }, output=('', 'SANAS')),
+        Case(input={
+            'ident': '',
+            'title': 'Simhash'
+        }, output=('', 'SANAS')),  # expected: same key regardless of letter case
+        Case(input={
+            'ident': '',
+            'title': 'Sim  hash'
+        }, output=('', 'SANAS')),  # expected: same key regardless of inner whitespace
+        Case(input={
+            'ident': '',
+            'title': 'THE year 1929'
+        }, output=('', 'TAR')),
+        Case(input={
+            'ident': '',
+            'title': '2019?'
+        }, output=('', '')),  # expected: digits and punctuation only give an empty key
+        Case(input={
+            'ident': '123',
+            'title': 'H~~2019?'
+        }, output=('123', 'H')),
+        Case(input={
+            'ident': '123',
+            'title': '世界'
+        }, output=('123', '')),  # expected: non-Latin title gives an empty key
     )
     for case in cases:
-        assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(case.input)
+        assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(
+            case.input)
+
+
+def test_sort_by_column():
+    """Sort a 4-line fixture by several sort(1) keys and remove every temp file after."""
+    expected_by_opts = (
+        ('-k 2', ['2 a', '2 b', '1 c', '9 d']),
+        ('-k 1', ['1 c', '2 a', '2 b', '9 d']),
+        # The fixture has no third column; the test expects whole-line order here.
+        ('-k 3', ['1 c', '2 a', '2 b', '9 d']),
+    )
+    with tempfile.NamedTemporaryFile(delete=False, mode="w") as tf:
+        tf.writelines(["2 b\n", "2 a\n", "9 d\n", "1 c\n"])
+    try:
+        for opts, expected in expected_by_opts:
+            fn = sort_by_column(tf.name, opts=opts)
+            try:
+                with open(fn) as f:
+                    assert [v.strip() for v in f] == expected, 'failed opts {}'.format(opts)
+            finally:
+                os.remove(fn)  # sorted output; presumably a fresh temp file per call
+    finally:
+        # delete=False keeps the fixture past the with-block, so remove it explicitly.
+        os.remove(tf.name)
+
+
+def test_group_by():
+    """Check the docs yielded by group_by; "c" must appear only when a comment is given."""
+    GroupCase = collections.namedtuple("GroupCase", "seq keyfunc valuefunc comment result")
+    cases = (
+        GroupCase(["0", "1"], lambda v: v, lambda v: v, "", [
+            {'k': '0', 'v': ['0']},
+            {'k': '1', 'v': ['1']},
+        ]),
+        GroupCase(["a 1", "a 2", "b 3"], lambda v: v.split()[0], lambda v: v.split()[1], "", [
+            {'k': 'a', 'v': ['1', '2']},
+            {'k': 'b', 'v': ['3']},
+        ]),
+        # A non-empty comment is attached to every emitted document under "c".
+        GroupCase(["a 1"], lambda v: v.split()[0], lambda v: v.split()[1], "x", [
+            {'k': 'a', 'v': ['1'], 'c': 'x'},
+        ]),
+    )
+
+    for case in cases:
+        got = list(group_by(case.seq, case.keyfunc, case.valuefunc, comment=case.comment))
+        assert case.result == got, 'failed case {}'.format(case.seq)