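"""
Tests for assorted helper functions in fuzzycat.utils.
"""
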
import os

import pytest

from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string,
                            token_n_grams, tokenize_string, parse_page_string, dict_key_exists,
                            zstdlines, es_compat_hits_total)
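

# slugify_string lowercases its input and drops punctuation while
# keeping the spaces between tokens.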
def test_slugify_string():
assert slugify_string("") == ""
assert slugify_string("X") == "x"
assert slugify_string("Xx") == "xx"
assert slugify_string("Xx x") == "xx x"
assert slugify_string("Xx x x") == "xx x x"
assert slugify_string("Xx?x x") == "xxx x"
assert slugify_string("Xx? ?x x") == "xx x x"
assert slugify_string("Xx?_?x--x") == "xxxx"
assert slugify_string("=?++*") == ""
def test_cut():
assert cut()("a b") == "a"
assert cut(1)("a b") == "b"
assert cut(2, sep=',')("a,b,c") == "c"
assert cut(3, sep=',')("a,b,c") == ""
    with pytest.raises(ValueError):
        cut(3, sep=',', ignore_missing_column=False)("a,b,c")
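

# author_similarity_score returns a similarity between 0.0 and 1.0; the
# expected values below follow from the underlying token n-gram
# comparison.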
def test_author_similarity_score():
assert author_similarity_score("", "") == 0.0
assert author_similarity_score("Gregor Samsa", "G. Samsa") == 0.42857142857142855
assert author_similarity_score("Geronimo Samsa", "G. Samsa") == 0.375
def test_jaccard():
assert jaccard(set(), set()) == 0
assert jaccard(set(["a"]), set()) == 0
assert jaccard(set(["a"]), set(["a"])) == 1.0
assert jaccard(set(["a", "b"]), set(["a"])) == 0.5
assert jaccard(set(["a"]), set(["a", "b"])) == 0.5
assert jaccard(set(["a", "b", "c"]), set(["a", "c"])) == 2 / 3
def test_token_n_grams():
assert token_n_grams("") == []
assert token_n_grams("a") == ["a"]
assert token_n_grams("abc") == ["ab", "c"]
assert token_n_grams("abc", n=3) == ["abc"]
assert token_n_grams("abc", n=1) == ["a", "b", "c"]
assert token_n_grams("abc hello world", n=3) == ["abc", "hel", "lo", "wor", "ld"]
def test_tokenize_string():
assert tokenize_string("") == []
assert tokenize_string("a") == ["a"]
assert tokenize_string("a b") == ["a", "b"]
assert tokenize_string("a b ") == ["a", "b"]
assert tokenize_string("a b=c") == ["a", "b=c"]
assert tokenize_string("a b 1999") == ["a", "b", "1999"]
assert tokenize_string("a?b*1999") == ["a?b*1999"]
def test_nwise():
assert list(nwise("1234")) == [("1", "2"), ("3", "4")]
assert list(nwise("1234", n=1)) == [("1", ), ("2", ), ("3", ), ("4", )]
assert list(nwise([1, 2, 3, 4, 5], n=3)) == [(1, 2, 3), (4, 5)]
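

# dict_key_exists takes a dotted path into nested dictionaries; a key
# that maps to None still counts as existing.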
def test_dict_key_exists():
assert dict_key_exists({}, "") is False
assert dict_key_exists({"a": "a"}, "a") == True
assert dict_key_exists({"a": "a"}, "b") == False
assert dict_key_exists({"a": {"b": "c"}}, "a.b") == True
assert dict_key_exists({"a": {"b": None}}, "a.b") == True
assert dict_key_exists({"a": {"b": "c"}}, "a.b.c") == False
def test_parse_page_string():
reject = ("", "123-2", "123-120", "123a-124", "-2-1")
for s in reject:
        with pytest.raises(ValueError):
            parse_page_string(s)
assert parse_page_string("123") == (123, 123, 1)
assert parse_page_string("123-5") == (123, 125, 3)
assert parse_page_string("123-125") == (123, 125, 3)
assert parse_page_string("123-124a") == (123, 124, 2)
assert parse_page_string("1-1000") == (1, 1000, 1000)
def test_zstdlines():
test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data/zstd")
examples = (
(os.path.join(test_dir, "lines.txt.zst"), os.path.join(test_dir, "lines.txt")),
(os.path.join(test_dir, "empty.txt.zst"), os.path.join(test_dir, "empty.txt")),
(os.path.join(test_dir, "single.txt.zst"), os.path.join(test_dir, "single.txt")),
)
for zfn, fn in examples:
with open(fn) as f:
assert [s.strip() for s in f.readlines()] == list(zstdlines(zfn))
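

# es_compat_hits_total reads the total hit count from both the older
# (integer) and newer (object with "value") Elasticsearch response
# shapes.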
def test_es_compat_hits_total():
cases = (
({"hits": {"total": 6}}, 6),
({"hits": {"total": {"value": 7, "relation": "eq"}}}, 7),
)
for r, expected in cases:
assert es_compat_hits_total(r) == expected