diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-02-02 00:40:41 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-02-02 00:40:41 +0100 |
commit | d5ade2a068e2f420b5376f07e13db66c5b43a01e (patch) | |
tree | ee9b9fba3e0aa49d0d6254d06e93e2ecdb4603b7 /tests | |
parent | 727f44887e0612b54010704dc997fd2ebd8b0344 (diff) | |
download | fuzzycat-d5ade2a068e2f420b5376f07e13db66c5b43a01e.tar.gz fuzzycat-d5ade2a068e2f420b5376f07e13db66c5b43a01e.zip |
add compress kwarg to cluster
Will compress intermediate results with zstd (https://git.io/Jt00y9).
Diffstat (limited to 'tests')
-rw-r--r-- | tests/data/zstd/empty.txt | 0 | ||||
-rw-r--r-- | tests/data/zstd/empty.txt.zst | bin | 0 -> 13 bytes | |||
-rw-r--r-- | tests/data/zstd/lines.txt | 9 | ||||
-rw-r--r-- | tests/data/zstd/lines.txt.zst | bin | 0 -> 31 bytes | |||
-rw-r--r-- | tests/data/zstd/single.txt | 1 | ||||
-rw-r--r-- | tests/data/zstd/single.txt.zst | bin | 0 -> 18 bytes | |||
-rw-r--r-- | tests/test_cluster.py | 13 | ||||
-rw-r--r-- | tests/test_utils.py | 16 |
8 files changed, 31 insertions, 8 deletions
diff --git a/tests/data/zstd/empty.txt b/tests/data/zstd/empty.txt new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tests/data/zstd/empty.txt diff --git a/tests/data/zstd/empty.txt.zst b/tests/data/zstd/empty.txt.zst Binary files differ new file mode 100644 index 0000000..e58c09d --- /dev/null +++ b/tests/data/zstd/empty.txt.zst diff --git a/tests/data/zstd/lines.txt b/tests/data/zstd/lines.txt new file mode 100644 index 0000000..0719398 --- /dev/null +++ b/tests/data/zstd/lines.txt @@ -0,0 +1,9 @@ +1 +2 +3 +4 +5 +6 +7 +8 +9 diff --git a/tests/data/zstd/lines.txt.zst b/tests/data/zstd/lines.txt.zst Binary files differ new file mode 100644 index 0000000..bc9be49 --- /dev/null +++ b/tests/data/zstd/lines.txt.zst diff --git a/tests/data/zstd/single.txt b/tests/data/zstd/single.txt new file mode 100644 index 0000000..4b37d57 --- /dev/null +++ b/tests/data/zstd/single.txt @@ -0,0 +1 @@ +zzzz diff --git a/tests/data/zstd/single.txt.zst b/tests/data/zstd/single.txt.zst Binary files differ new file mode 100644 index 0000000..47e377f --- /dev/null +++ b/tests/data/zstd/single.txt.zst diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 3ad32a7..793798b 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -109,20 +109,19 @@ def test_release_key_title_nysiis(): def test_cluster(): sio = io.StringIO() - cluster = Cluster([ - json.dumps(line) for line in [ + lines = [ + json.dumps(doc) for doc in [ { "title": "hello world", - "ident": 1 + "ident": 1, }, { "title": "hello world!", - "ident": 2 + "ident": 2, }, ] - ], - release_key_title_normalized, - output=sio) + ] + cluster = Cluster(lines, release_key_title_normalized, output=sio) stats = cluster.run() assert stats == { "key_fail": 0, diff --git a/tests/test_utils.py b/tests/test_utils.py index fa930fe..29b125b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,9 @@ import pytest +import os from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, 
slugify_string, - token_n_grams, tokenize_string, parse_page_string, dict_key_exists) + token_n_grams, tokenize_string, parse_page_string, dict_key_exists, + zstdlines) def test_slugify_string(): @@ -84,3 +86,15 @@ def test_page_page_string(): assert parse_page_string("123-125") == (123, 125, 3) assert parse_page_string("123-124a") == (123, 124, 2) assert parse_page_string("1-1000") == (1, 1000, 1000) + + +def test_zstdlines(): + test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data/zstd") + examples = ( + (os.path.join(test_dir, "lines.txt.zst"), os.path.join(test_dir, "lines.txt")), + (os.path.join(test_dir, "empty.txt.zst"), os.path.join(test_dir, "empty.txt")), + (os.path.join(test_dir, "single.txt.zst"), os.path.join(test_dir, "single.txt")), + ) + for zfn, fn in examples: + with open(fn) as f: + assert [s.strip() for s in f.readlines()] == list(zstdlines(zfn)) |