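"""Tests for the small helper functions in fuzzycat.utils."""
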
import pytest

from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise,
                            slugify_string, token_n_grams, tokenize_string)

def test_slugify_string():
    """slugify_string lowercases and strips punctuation, keeping spaces and alphanumerics."""
    assert slugify_string("") == ""
    assert slugify_string("X") == "x"
    assert slugify_string("Xx") == "xx"
    assert slugify_string("Xx x") == "xx x"
    assert slugify_string("Xx x x") == "xx x x"
    assert slugify_string("Xx?x x") == "xxx x"
    assert slugify_string("Xx? ?x x") == "xx x x"
    assert slugify_string("Xx?_?x--x") == "xxxx"
    assert slugify_string("=?++*") == ""
def test_cut():
    """cut(f) returns a callable extracting the f-th column of a line;
    missing columns yield "" unless ignore_missing_column is False."""
    assert cut()("a b") == "a"
    assert cut(1)("a b") == "b"
    assert cut(2, sep=',')("a,b,c") == "c"
    assert cut(3, sep=',')("a,b,c") == ""
    with pytest.raises(ValueError):
        cut(3, sep=',', ignore_missing_column=False)("a,b,c")
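
# A hypothetical equivalent of cut, inferred from the assertions above: it
# closes over a column index and separator, tolerating missing columns unless
# ignore_missing_column is False (cut_sketch is illustrative, not fuzzycat's
# actual implementation).

def cut_sketch(f=0, sep=None, ignore_missing_column=True):
    def inner(line):
        parts = line.split(sep)
        if f >= len(parts):
            if ignore_missing_column:
                return ""
            raise ValueError("column {} not found".format(f))
        return parts[f]
    return inner
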
def test_author_similarity_score():
    """Similarity score between two author name strings."""
    assert author_similarity_score("", "") == 0.0
    assert author_similarity_score("Gregor Samsa", "G. Samsa") == 0.42857142857142855
    assert author_similarity_score("Geronimo Samsa", "G. Samsa") == 0.375
def test_jaccard():
    """Jaccard set similarity; empty inputs score 0."""
    assert jaccard(set(), set()) == 0
    assert jaccard(set(["a"]), set()) == 0
    assert jaccard(set(["a"]), set(["a"])) == 1.0
    assert jaccard(set(["a", "b"]), set(["a"])) == 0.5
    assert jaccard(set(["a"]), set(["a", "b"])) == 0.5
    assert jaccard(set(["a", "b", "c"]), set(["a", "c"])) == 2 / 3
def test_token_n_grams():
    """Each whitespace token is split into n-character chunks (default n=2)."""
    assert token_n_grams("") == []
    assert token_n_grams("a") == ["a"]
    assert token_n_grams("abc") == ["ab", "c"]
    assert token_n_grams("abc", n=3) == ["abc"]
    assert token_n_grams("abc", n=1) == ["a", "b", "c"]
    assert token_n_grams("abc hello world", n=3) == ["abc", "hel", "lo", "wor", "ld"]
def test_tokenize_string():
    """Whitespace tokenization; runs of non-space characters stay intact."""
    assert tokenize_string("") == []
    assert tokenize_string("a") == ["a"]
    assert tokenize_string("a b") == ["a", "b"]
    assert tokenize_string("a b ") == ["a", "b"]
    assert tokenize_string("a b=c") == ["a", "b=c"]
    assert tokenize_string("a b 1999") == ["a", "b", "1999"]
    assert tokenize_string("a?b*1999") == ["a?b*1999"]
def test_nwise():
    """Groups an iterable into tuples of up to n elements (default n=2)."""
    assert list(nwise("1234")) == [("1", "2"), ("3", "4")]
    assert list(nwise("1234", n=1)) == [("1", ), ("2", ), ("3", ), ("4", )]
    assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]
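
# A hypothetical equivalent that slices n items at a time until the iterator
# is exhausted (nwise_sketch is illustrative, not fuzzycat's implementation):

def nwise_sketch(iterable, n=2):
    from itertools import islice
    it = iter(iterable)
    while True:
        chunk = tuple(islice(it, n))
        if not chunk:
            return
        yield chunk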