# tests/test_utils.py

import pytest
import os

from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string,
                            token_n_grams, tokenize_string, parse_page_string, dict_key_exists,
                            zstdlines, es_compat_hits_total)


def test_slugify_string():
    """Check slugify_string against a table of (raw input, expected slug) pairs.

    The inputs cover the empty string, upper-case letters, repeated
    whitespace, and punctuation-only or punctuation-laden strings.
    """
    expectations = (
        ("", ""),
        ("X", "x"),
        ("Xx", "xx"),
        ("Xx x", "xx x"),
        ("Xx x  x", "xx x x"),
        ("Xx?x  x", "xxx x"),
        ("Xx? ?x  x", "xx x x"),
        ("Xx?_?x--x", "xxxx"),
        ("=?++*", ""),
    )
    for raw, slug in expectations:
        assert slugify_string(raw) == slug


def test_cut():
    """Exercise cut(): column selection, custom separators, missing columns.

    cut(...) returns a callable that is then applied to a single line. The
    cases below expect an empty string for an out-of-range column by default
    and a ValueError when ignore_missing_column=False.
    """
    # Tab-separated input; the escape keeps the separator visible in the source.
    assert cut()("a\tb") == "a"
    assert cut(1)("a\tb") == "b"
    assert cut(2, sep=',')("a,b,c") == "c"
    # "a,b,c" has no column 3; without further arguments the result is "".
    assert cut(3, sep=',')("a,b,c") == ""
    with pytest.raises(ValueError):
        # Only the call matters here; it must raise before returning a value.
        cut(3, sep=',', ignore_missing_column=False)("a,b,c")


def test_author_similarity_score():
    """Check author_similarity_score on an empty pair and two abbreviated names.

    The scores are floats, so they are compared with pytest.approx instead of
    exact equality; this keeps the test from failing on last-bit rounding
    differences while still pinning the expected values.
    """
    assert author_similarity_score("", "") == pytest.approx(0.0)
    assert author_similarity_score("Gregor Samsa", "G. Samsa") == pytest.approx(0.42857142857142855)
    assert author_similarity_score("Geronimo Samsa", "G. Samsa") == pytest.approx(0.375)


def test_jaccard():
    """Compare jaccard() results for pairs of sets against expected values.

    Each row is (left set, right set, expected result); the rows include
    empty sets, identical sets and partially overlapping sets in both
    argument orders.
    """
    table = (
        (set(), set(), 0),
        ({"a"}, set(), 0),
        ({"a"}, {"a"}, 1.0),
        ({"a", "b"}, {"a"}, 0.5),
        ({"a"}, {"a", "b"}, 0.5),
        ({"a", "b", "c"}, {"a", "c"}, 2 / 3),
    )
    for left, right, expected in table:
        assert jaccard(left, right) == expected


def test_token_n_grams():
    """Check token_n_grams for several inputs, with and without an explicit n.

    Each row is (text, keyword arguments, expected list of n-grams). Rows with
    an empty keyword dict rely on the function's default n.
    """
    cases = (
        ("", {}, []),
        ("a", {}, ["a"]),
        ("abc", {}, ["ab", "c"]),
        ("abc", {"n": 3}, ["abc"]),
        ("abc", {"n": 1}, ["a", "b", "c"]),
        ("abc hello world", {"n": 3}, ["abc", "hel", "lo", "wor", "ld"]),
    )
    for text, options, expected in cases:
        assert token_n_grams(text, **options) == expected


def test_tokenize_string():
    """Check tokenize_string against a table of (input, expected tokens).

    The rows cover the empty string, surplus whitespace, digits, and
    punctuation embedded inside a token.
    """
    expectations = (
        ("", []),
        ("a", ["a"]),
        ("a b", ["a", "b"]),
        ("a  b  ", ["a", "b"]),
        ("a b=c", ["a", "b=c"]),
        ("a b 1999", ["a", "b", "1999"]),
        ("a?b*1999", ["a?b*1999"]),
    )
    for text, tokens in expectations:
        assert tokenize_string(text) == tokens


def test_nwise():
    """Check that nwise() yields tuples of up to n items from an iterable.

    The last case shows that a trailing group may be shorter than n.
    """
    pairs = [*nwise("1234")]
    assert pairs == [("1", "2"), ("3", "4")]

    singles = [*nwise("1234", n=1)]
    assert singles == [("1", ), ("2", ), ("3", ), ("4", )]

    triples = [*nwise([1, 2, 3, 4, 5], n=3)]
    assert triples == [(1, 2, 3), (4, 5)]


def test_dict_key_exists():
    """Check dict_key_exists for flat and dotted key paths.

    All results are compared by identity against True / False, so the test
    also pins that the function returns real booleans rather than merely
    truthy or falsy values.
    """
    assert dict_key_exists({}, "") is False
    assert dict_key_exists({"a": "a"}, "a") is True
    assert dict_key_exists({"a": "a"}, "b") is False
    assert dict_key_exists({"a": {"b": "c"}}, "a.b") is True
    # A key whose value is None still counts as existing.
    assert dict_key_exists({"a": {"b": None}}, "a.b") is True
    # The path continues past a non-dict leaf value.
    assert dict_key_exists({"a": {"b": "c"}}, "a.b.c") is False


def test_page_page_string():
    """Check parse_page_string on malformed and well-formed page strings.

    Every string in ``reject`` must raise ValueError. Well-formed strings are
    expected to yield a 3-tuple; judging from the cases below it looks like
    (first page, last page, number of pages) -- confirm against the
    implementation.
    """
    reject = ("", "123-2", "123-120", "123a-124", "-2-1")
    for s in reject:
        with pytest.raises(ValueError):
            # Plain call: wrapping it in ``assert`` could turn a falsy return
            # value into an AssertionError instead of a clear "DID NOT RAISE".
            parse_page_string(s)
    assert parse_page_string("123") == (123, 123, 1)
    # A shortened end page ("5") is expected to resolve to 125.
    assert parse_page_string("123-5") == (123, 125, 3)
    assert parse_page_string("123-125") == (123, 125, 3)
    # A trailing letter on the end page is accepted here, unlike "123a-124".
    assert parse_page_string("123-124a") == (123, 124, 2)
    assert parse_page_string("1-1000") == (1, 1000, 1000)


def test_zstdlines():
    """Compare zstdlines() output for each fixture with its plain-text twin.

    For every (compressed, plain) file pair under data/zstd next to this test
    module, the stripped lines of the plain file must equal the items that
    zstdlines() yields for the compressed file.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    fixture_dir = os.path.join(here, "data/zstd")
    name_pairs = (
        ("lines.txt.zst", "lines.txt"),
        ("empty.txt.zst", "empty.txt"),
        ("single.txt.zst", "single.txt"),
    )
    for compressed_name, plain_name in name_pairs:
        compressed_path = os.path.join(fixture_dir, compressed_name)
        plain_path = os.path.join(fixture_dir, plain_name)
        with open(plain_path) as handle:
            expected = [line.strip() for line in handle]
            assert expected == list(zstdlines(compressed_path))


def test_es_compat_hits_total():
    """Check es_compat_hits_total for both shapes of ``hits.total``.

    One response carries the total as a bare integer, the other as an object
    with "value" and "relation" keys (presumably the older and newer
    Elasticsearch response formats -- confirm against the implementation).
    """
    integer_total = {"hits": {"total": 6}}
    object_total = {"hits": {"total": {"value": 7, "relation": "eq"}}}

    for response, expected in ((integer_total, 6), (object_total, 7)):
        assert es_compat_hits_total(response) == expected