diff options
| -rw-r--r-- | fuzzycat/cluster.py | 8 | ||||
| -rw-r--r-- | fuzzycat/main.py | 2 | ||||
| -rw-r--r-- | tests/test_cluster.py | 138 | 
3 files changed, 125 insertions, 23 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index a40db06..b2f739f 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -6,13 +6,13 @@ Clustering stage.  import fileinput  import itertools  import json +import logging  import operator  import os  import re  import subprocess  import sys  import tempfile -import logging  import fuzzy @@ -29,6 +29,7 @@ get_ident_title = operator.itemgetter("ident", "title")  ws_replacer = str.maketrans({"\t": " ", "\n": " "})  non_word_re = re.compile(r'[\W_]+', re.UNICODE) +  def release_key_title(release_entity):      id, title = get_ident_title(release_entity)      if not title: @@ -48,6 +49,7 @@ def release_key_title_nysiis(release_entity):      id, title = release_key_title(release_entity)      return (id, fuzzy.nysiis(title)) +  def release_key_title_authors_ngram(release_entity):      """      Derive a key from title and authors. @@ -55,7 +57,6 @@ def release_key_title_authors_ngram(release_entity):      # SS: compare ngram sets? -  def sort_by_column(filename, opts="-k 2", fast=True, mode="w", prefix="fuzzycat-", tmpdir=None):      """      Sort tabular file with sort(1), returns the filename of the sorted file. @@ -81,8 +82,9 @@ def group_by(seq, key=None, value=None, comment=""):          doc = {              "k": k.strip(),              "v": [value(v) for v in g], -            "c": comment,          } +        if comment: +            doc["c"] = comment          yield doc diff --git a/fuzzycat/main.py b/fuzzycat/main.py index 44e0857..5f9efc3 100644 --- a/fuzzycat/main.py +++ b/fuzzycat/main.py @@ -11,9 +11,9 @@ Run, e.g. fuzzycat cluster --help for more options. Example:  """  import argparse +import logging  import sys  import tempfile -import logging  from fuzzycat.cluster import (Cluster, release_key_title, release_key_title_normalized,                                release_key_title_nysiis) diff --git a/tests/test_cluster.py b/tests/test_cluster.py index aa015a6..7439e15 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -1,9 +1,12 @@  import pytest  import collections -from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis +import tempfile +from fuzzycat.cluster import release_key_title, release_key_title_normalized, release_key_title_nysiis, sort_by_column, group_by +import os  Case = collections.namedtuple("Case", 'input output') +  def test_release_key_title():      with pytest.raises(KeyError):          release_key_title({}) @@ -14,9 +17,18 @@ def test_release_key_title():      with pytest.raises(ValueError, match='title.*missing'):          release_key_title({'ident': '', 'title': ''})      cases = ( -        Case(input={'ident': '', 'title': 'simhash'}, output=('', 'simhash')), -        Case(input={'ident': '', 'title': 'Simhash'}, output=('', 'Simhash')), -        Case(input={'ident': '', 'title': 'Sim  hash'}, output=('', 'Sim  hash')), +        Case(input={ +            'ident': '', +            'title': 'simhash' +        }, output=('', 'simhash')), +        Case(input={ +            'ident': '', +            'title': 'Simhash' +        }, output=('', 'Simhash')), +        Case(input={ +            'ident': '', +            'title': 'Sim  hash' +        }, output=('', 'Sim  hash')),      )      for case in cases:          assert case.output == release_key_title(case.input) @@ -24,25 +36,113 @@ def test_release_key_title():  def test_release_key_title_normalized():      cases = ( -        Case(input={'ident': '', 'title': 'simhash'}, output=('', 'simhash')), -        Case(input={'ident': '', 'title': 'Simhash'}, output=('', 'simhash')), -        Case(input={'ident': '', 'title': 'Sim  hash'}, output=('', 'simhash')), -        Case(input={'ident': '', 'title': 'THE year 1929'}, output=('', 'theyear1929')), -        Case(input={'ident': '', 'title': '2019?'}, output=('', '2019')), -        Case(input={'ident': '123', 'title': 'H~~2019?'}, output=('123', 'h2019')), +        Case(input={ +            'ident': '', +            'title': 'simhash' +        }, output=('', 'simhash')), +        Case(input={ +            'ident': '', +            'title': 'Simhash' +        }, output=('', 'simhash')), +        Case(input={ +            'ident': '', +            'title': 'Sim  hash' +        }, output=('', 'simhash')), +        Case(input={ +            'ident': '', +            'title': 'THE year 1929' +        }, output=('', 'theyear1929')), +        Case(input={ +            'ident': '', +            'title': '2019?' +        }, output=('', '2019')), +        Case(input={ +            'ident': '123', +            'title': 'H~~2019?' +        }, output=('123', 'h2019')),      )      for case in cases: -        assert case.output == release_key_title_normalized(case.input), 'failed case {}'.format(case.input) +        assert case.output == release_key_title_normalized(case.input), 'failed case {}'.format( +            case.input) +  def test_release_key_title_nysiis():      cases = ( -        Case(input={'ident': '', 'title': 'simhash'}, output=('', 'SANAS')), -        Case(input={'ident': '', 'title': 'Simhash'}, output=('', 'SANAS')), -        Case(input={'ident': '', 'title': 'Sim  hash'}, output=('', 'SANAS')), -        Case(input={'ident': '', 'title': 'THE year 1929'}, output=('', 'TAR')), -        Case(input={'ident': '', 'title': '2019?'}, output=('', '')), -        Case(input={'ident': '123', 'title': 'H~~2019?'}, output=('123', 'H')), -        Case(input={'ident': '123', 'title': '世界'}, output=('123', '')), +        Case(input={ +            'ident': '', +            'title': 'simhash' +        }, output=('', 'SANAS')), +        Case(input={ +            'ident': '', +            'title': 'Simhash' +        }, output=('', 'SANAS')), +        Case(input={ +            'ident': '', +            'title': 'Sim  hash' +        }, output=('', 'SANAS')), +        Case(input={ +            'ident': '', +            'title': 'THE year 1929' +        }, output=('', 'TAR')), +        Case(input={ +            'ident': '', +            'title': '2019?' +        }, output=('', '')), +        Case(input={ +            'ident': '123', +            'title': 'H~~2019?' +        }, output=('123', 'H')), +        Case(input={ +            'ident': '123', +            'title': '世界' +        }, output=('123', '')),      )      for case in cases: -        assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format(case.input) +        assert case.output == release_key_title_nysiis(case.input), 'failed case {}'.format( +            case.input) + + +def test_sort_by_column(): +    with tempfile.NamedTemporaryFile(delete=False, mode="w") as tf: +        tf.write("2 b\n") +        tf.write("2 a\n") +        tf.write("9 d\n") +        tf.write("1 c\n") + +    fn = sort_by_column(tf.name, opts='-k 2') +    with open(fn) as f: +        lines = [v.strip() for v in f] +        assert lines == ['2 a', '2 b', '1 c', '9 d'] + +    fn = sort_by_column(tf.name, opts='-k 1') +    with open(fn) as f: +        lines = [v.strip() for v in f] +        assert lines == ['1 c', '2 a', '2 b', '9 d'] + +    fn = sort_by_column(tf.name, opts='-k 3') +    with open(fn) as f: +        lines = [v.strip() for v in f] +        assert lines == ['1 c', '2 a', '2 b', '9 d'] + + +def test_group_by(): +    Case = collections.namedtuple("Case", "seq keyfunc valuefunc result") +    cases = ( +        Case(["0", "1"], lambda v: v, lambda v: v, [{ +            'k': '0', +            'v': ['0'] +        }, { +            'k': '1', +            'v': ['1'] +        }]), +        Case(["a 1", "a 2", "b 3"], lambda v: v.split()[0], lambda v: v.split()[1], [{ +            'k': 'a', +            'v': ['1', '2'] +        }, { +            'k': 'b', +            'v': ['3'] +        }]), +    ) + +    for case in cases: +        assert case.result == list(group_by(case.seq, case.keyfunc, case.valuefunc))  | 
