# tests/test_utils.py

import pytest
import os

from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string,
                            token_n_grams, tokenize_string, parse_page_string, dict_has_key,
                            zstdlines, es_compat_hits_total, clean_doi)


def test_slugify_string():
    """Check slugify_string against a table of (input, expected slug) pairs.

    The expectations below cover lowercasing, collapsing of repeated spaces
    and removal of punctuation such as "?", "_", "-", "=", "+" and "*".
    """
    expectations = (
        ("", ""),
        ("X", "x"),
        ("Xx", "xx"),
        ("Xx x", "xx x"),
        ("Xx x  x", "xx x x"),
        ("Xx?x  x", "xxx x"),
        ("Xx? ?x  x", "xx x x"),
        ("Xx?_?x--x", "xxxx"),
        ("=?++*", ""),
    )
    for raw, slug in expectations:
        assert slugify_string(raw) == slug


def test_cut():
    """Check column selection via cut(), including the missing-column cases.

    cut(...) returns a callable that is applied to a single line. The cases
    cover the default call on tab-separated input, an explicit column index,
    a custom separator, and an out-of-range column both with the default
    behavior (empty string) and with ignore_missing_column=False (ValueError).
    """
    # Tabs are written as escapes so the separator is visible and cannot be
    # silently converted to spaces by an editor.
    assert cut()("a\tb") == "a"
    assert cut(1)("a\tb") == "b"
    assert cut(2, sep=',')("a,b,c") == "c"
    # Column 3 does not exist in "a,b,c"; by default that yields "".
    assert cut(3, sep=',')("a,b,c") == ""
    with pytest.raises(ValueError):
        # Only the call matters here; its result is never reached or compared.
        cut(3, sep=',', ignore_missing_column=False)("a,b,c")


def test_author_similarity_score():
    """Pin author_similarity_score for an empty pair and two abbreviated names."""
    scored_pairs = (
        ("", "", 0.0),
        ("Gregor Samsa", "G. Samsa", 0.42857142857142855),
        ("Geronimo Samsa", "G. Samsa", 0.375),
    )
    for left, right, score in scored_pairs:
        assert author_similarity_score(left, right) == score


def test_jaccard():
    """Check jaccard() on empty, identical, subset and superset set pairs."""
    cases = (
        (set(), set(), 0),
        ({"a"}, set(), 0),
        ({"a"}, {"a"}, 1.0),
        ({"a", "b"}, {"a"}, 0.5),
        ({"a"}, {"a", "b"}, 0.5),
        ({"a", "b", "c"}, {"a", "c"}, 2 / 3),
    )
    for left, right, expected in cases:
        assert jaccard(left, right) == expected


def test_token_n_grams():
    """Check token_n_grams with its default n and with explicit n values.

    Each case is (text, keyword arguments, expected n-grams); an empty
    keyword dict exercises the function's default.
    """
    cases = (
        ("", {}, []),
        ("a", {}, ["a"]),
        ("abc", {}, ["ab", "c"]),
        ("abc", {"n": 3}, ["abc"]),
        ("abc", {"n": 1}, ["a", "b", "c"]),
        ("abc hello world", {"n": 3}, ["abc", "hel", "lo", "wor", "ld"]),
    )
    for text, kwargs, expected in cases:
        assert token_n_grams(text, **kwargs) == expected


def test_tokenize_string():
    """Check tokenize_string against a table of (input, expected tokens).

    The expectations show splitting on runs of spaces only; characters such
    as "=", "?" and "*" stay inside their token.
    """
    expectations = (
        ("", []),
        ("a", ["a"]),
        ("a b", ["a", "b"]),
        ("a  b  ", ["a", "b"]),
        ("a b=c", ["a", "b=c"]),
        ("a b 1999", ["a", "b", "1999"]),
        ("a?b*1999", ["a?b*1999"]),
    )
    for text, tokens in expectations:
        assert tokenize_string(text) == tokens


def test_nwise():
    """Check nwise() chunking for the default size, n=1 and a ragged n=3."""
    pairs = list(nwise("1234"))
    assert pairs == [("1", "2"), ("3", "4")]

    singles = list(nwise("1234", n=1))
    assert singles == [("1", ), ("2", ), ("3", ), ("4", )]

    # Five items in chunks of three: the last chunk is shorter.
    triples = list(nwise([1, 2, 3, 4, 5], n=3))
    assert triples == [(1, 2, 3), (4, 5)]


def test_dict_has_key():
    """Check dict_has_key for flat keys and dotted paths into nested dicts.

    All results are compared by identity against True/False, consistent with
    the first assertion, so the test also pins that real booleans come back.
    """
    assert dict_has_key({}, "") is False
    assert dict_has_key({"a": "a"}, "a") is True
    assert dict_has_key({"a": "a"}, "b") is False
    assert dict_has_key({"a": {"b": "c"}}, "a.b") is True
    # A key whose value is None still counts as present.
    assert dict_has_key({"a": {"b": None}}, "a.b") is True
    # The path may not descend past a non-dict leaf value.
    assert dict_has_key({"a": {"b": "c"}}, "a.b.c") is False


def test_page_page_string():
    """Check parse_page_string on rejected and accepted page strings.

    Rejected inputs must raise ValueError. Accepted inputs are expected to
    return a (start, end, count) tuple, with end and count set to None when
    only a single page is given. The function name is kept as-is so existing
    test selections keep working.
    """
    reject = ("", "123-2", "123-120", "123a-124", "-2-1", "I-II", "xv-xvi", "p")
    for s in reject:
        with pytest.raises(ValueError):
            # No assert here: the call itself must raise, so a return value
            # would never be inspected.
            parse_page_string(s)
    assert parse_page_string("123") == (123, None, None)
    assert parse_page_string("90-90") == (90, 90, 1)
    # An abbreviated end page is expanded against the start page.
    assert parse_page_string("123-5") == (123, 125, 3)
    assert parse_page_string("123-125") == (123, 125, 3)
    assert parse_page_string("123-124a") == (123, 124, 2)
    assert parse_page_string("1-1000") == (1, 1000, 1000)
    assert parse_page_string("p55") == (55, None, None)
    assert parse_page_string("p55-65") == (55, 65, 11)
    assert parse_page_string("e1234") == (1234, None, None)
    assert parse_page_string("577-89") == (577, 589, 13)


def test_zstdlines():
    """Compare zstdlines() output with the stripped lines of plain-text twins.

    Each fixture under data/zstd (next to this file) exists as a ".zst" file
    and as a plain text file; both are expected to yield the same lines.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    test_dir = os.path.join(here, "data/zstd")
    fixture_names = (
        ("lines.txt.zst", "lines.txt"),
        ("empty.txt.zst", "empty.txt"),
        ("single.txt.zst", "single.txt"),
    )
    for compressed_name, plain_name in fixture_names:
        zfn = os.path.join(test_dir, compressed_name)
        fn = os.path.join(test_dir, plain_name)
        with open(fn) as handle:
            expected_lines = [line.strip() for line in handle.readlines()]
            assert expected_lines == list(zstdlines(zfn))


def test_es_compat_hits_total():
    """Check es_compat_hits_total for both shapes of the "hits.total" field.

    One response carries the total as a bare integer, the other as an object
    with "value" and "relation" keys; the count is expected in both cases.
    """
    integer_total = {"hits": {"total": 6}}
    object_total = {"hits": {"total": {"value": 7, "relation": "eq"}}}

    assert es_compat_hits_total(integer_total) == 6
    assert es_compat_hits_total(object_total) == 7


def test_clean_doi():
    """Check clean_doi normalization and rejection of non-DOI input.

    The cases cover None and non-DOI input (both yield None), trailing
    whitespace, a doubled slash after the prefix, URL and "doi:" prefixes,
    leading junk before the DOI, and lowercasing of the result.
    """
    # None is a singleton, so compare by identity rather than equality.
    assert clean_doi(None) is None
    assert clean_doi("blah") is None
    assert clean_doi("10.1234/asdf ") == "10.1234/asdf"
    assert clean_doi("10.1037//0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
    assert clean_doi("10.1037/0002-9432.72.1.50") == "10.1037/0002-9432.72.1.50"
    assert clean_doi("http://doi.org/10.1234/asdf ") == "10.1234/asdf"
    # GROBID mangled DOI
    assert clean_doi("21924DOI10.1234/asdf ") == "10.1234/asdf"
    assert clean_doi("https://dx.doi.org/10.1234/asdf ") == "10.1234/asdf"
    assert clean_doi("doi:10.1234/asdf ") == "10.1234/asdf"
    assert clean_doi("10.7326/M20-6817") == "10.7326/m20-6817"