From d5ade2a068e2f420b5376f07e13db66c5b43a01e Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 2 Feb 2021 00:40:41 +0100 Subject: add compress kwarg to cluster Will compress intermediate results with zstd (https://git.io/Jt00y9). --- tests/data/zstd/empty.txt | 0 tests/data/zstd/empty.txt.zst | Bin 0 -> 13 bytes tests/data/zstd/lines.txt | 9 +++++++++ tests/data/zstd/lines.txt.zst | Bin 0 -> 31 bytes tests/data/zstd/single.txt | 1 + tests/data/zstd/single.txt.zst | Bin 0 -> 18 bytes tests/test_cluster.py | 13 ++++++------- tests/test_utils.py | 16 +++++++++++++++- 8 files changed, 31 insertions(+), 8 deletions(-) create mode 100644 tests/data/zstd/empty.txt create mode 100644 tests/data/zstd/empty.txt.zst create mode 100644 tests/data/zstd/lines.txt create mode 100644 tests/data/zstd/lines.txt.zst create mode 100644 tests/data/zstd/single.txt create mode 100644 tests/data/zstd/single.txt.zst (limited to 'tests') diff --git a/tests/data/zstd/empty.txt b/tests/data/zstd/empty.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/data/zstd/empty.txt.zst b/tests/data/zstd/empty.txt.zst new file mode 100644 index 0000000..e58c09d Binary files /dev/null and b/tests/data/zstd/empty.txt.zst differ diff --git a/tests/data/zstd/lines.txt b/tests/data/zstd/lines.txt new file mode 100644 index 0000000..0719398 --- /dev/null +++ b/tests/data/zstd/lines.txt @@ -0,0 +1,9 @@ +1 +2 +3 +4 +5 +6 +7 +8 +9 diff --git a/tests/data/zstd/lines.txt.zst b/tests/data/zstd/lines.txt.zst new file mode 100644 index 0000000..bc9be49 Binary files /dev/null and b/tests/data/zstd/lines.txt.zst differ diff --git a/tests/data/zstd/single.txt b/tests/data/zstd/single.txt new file mode 100644 index 0000000..4b37d57 --- /dev/null +++ b/tests/data/zstd/single.txt @@ -0,0 +1 @@ +zzzz diff --git a/tests/data/zstd/single.txt.zst b/tests/data/zstd/single.txt.zst new file mode 100644 index 0000000..47e377f Binary files /dev/null and b/tests/data/zstd/single.txt.zst differ diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 3ad32a7..793798b 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -109,20 +109,19 @@ def test_release_key_title_nysiis(): def test_cluster(): sio = io.StringIO() - cluster = Cluster([ - json.dumps(line) for line in [ + lines = [ + json.dumps(doc) for doc in [ { "title": "hello world", - "ident": 1 + "ident": 1, }, { "title": "hello world!", - "ident": 2 + "ident": 2, }, ] - ], - release_key_title_normalized, - output=sio) + ] + cluster = Cluster(lines, release_key_title_normalized, output=sio) stats = cluster.run() assert stats == { "key_fail": 0, diff --git a/tests/test_utils.py b/tests/test_utils.py index fa930fe..29b125b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,9 @@ import pytest +import os from fuzzycat.utils import (author_similarity_score, cut, jaccard, nwise, slugify_string, - token_n_grams, tokenize_string, parse_page_string, dict_key_exists) + token_n_grams, tokenize_string, parse_page_string, dict_key_exists, + zstdlines) def test_slugify_string(): @@ -84,3 +86,15 @@ def test_page_page_string(): assert parse_page_string("123-125") == (123, 125, 3) assert parse_page_string("123-124a") == (123, 124, 2) assert parse_page_string("1-1000") == (1, 1000, 1000) + + +def test_zstdlines(): + test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data/zstd") + examples = ( + (os.path.join(test_dir, "lines.txt.zst"), os.path.join(test_dir, "lines.txt")), + (os.path.join(test_dir, "empty.txt.zst"), os.path.join(test_dir, "empty.txt")), + (os.path.join(test_dir, "single.txt.zst"), os.path.join(test_dir, "single.txt")), + ) + for zfn, fn in examples: + with open(fn) as f: + assert [s.strip() for s in f.readlines()] == list(zstdlines(zfn)) -- cgit v1.2.3