diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-02-02 00:40:41 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-02-02 00:40:41 +0100 |
commit | d5ade2a068e2f420b5376f07e13db66c5b43a01e (patch) | |
tree | ee9b9fba3e0aa49d0d6254d06e93e2ecdb4603b7 /fuzzycat/utils.py | |
parent | 727f44887e0612b54010704dc997fd2ebd8b0344 (diff) | |
download | fuzzycat-d5ade2a068e2f420b5376f07e13db66c5b43a01e.tar.gz fuzzycat-d5ade2a068e2f420b5376f07e13db66c5b43a01e.zip |
add compress kwarg to cluster
Will compress intermediate results with zstd (https://git.io/Jt00y9).
Diffstat (limited to 'fuzzycat/utils.py')
-rw-r--r-- | fuzzycat/utils.py | 22 |
1 files changed, 22 insertions, 0 deletions
# fuzzycat/utils.py (index 84db5ec..55729a1): the change adds the zstandard
# import to the module's import block and appends zstdlines() after
# random_idents_from_query().
import codecs

from zstandard import ZstdDecompressor


def zstdlines(filename, encoding="utf-8", bufsize=65536):
    """
    Generator over lines from a zstd compressed file.

    The file is decompressed as a stream and read in pieces of at most
    `bufsize` bytes. Lines are separated by the newline character only; they
    are yielded as str, decoded with `encoding`, without the trailing
    newline. If the data does not end with a newline, the remaining text is
    yielded as a final line.

    Raises UnicodeDecodeError, if the decompressed data is not valid in the
    given encoding.
    """
    # An incremental decoder keeps the bytes of a character that got cut in
    # half at a chunk boundary and completes it with the next chunk, instead
    # of failing on the partial sequence.
    decoder = codecs.getincrementaldecoder(encoding)()
    decomp = ZstdDecompressor()
    with open(filename, "rb") as f:
        with decomp.stream_reader(f) as reader:
            # Text after the last newline seen so far; it belongs to a line
            # that continues in a later chunk.
            tail = ""
            while True:
                chunk = reader.read(bufsize)
                if not chunk:
                    break
                # Always prepend the carried-over text before splitting, so
                # a line spanning several newline-free chunks stays intact.
                *lines, tail = (tail + decoder.decode(chunk)).split("\n")
                yield from lines
            # Flush the decoder; this raises if the data ends in the middle
            # of a character.
            tail += decoder.decode(b"", final=True)
            if tail:
                # Last line of a file without a terminating newline.
                yield tail