aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/utils.py
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-02-02 00:40:41 +0100
committerMartin Czygan <martin.czygan@gmail.com>2021-02-02 00:40:41 +0100
commitd5ade2a068e2f420b5376f07e13db66c5b43a01e (patch)
treeee9b9fba3e0aa49d0d6254d06e93e2ecdb4603b7 /fuzzycat/utils.py
parent727f44887e0612b54010704dc997fd2ebd8b0344 (diff)
downloadfuzzycat-d5ade2a068e2f420b5376f07e13db66c5b43a01e.tar.gz
fuzzycat-d5ade2a068e2f420b5376f07e13db66c5b43a01e.zip
add compress kwarg to cluster
Will compress intermediate results with zstd (https://git.io/Jt00y9).
Diffstat (limited to 'fuzzycat/utils.py')
-rw-r--r--fuzzycat/utils.py22
1 files changed, 22 insertions, 0 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index 84db5ec..55729a1 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -8,6 +8,7 @@ import string
import requests
from glom import PathAccessError, glom
+from zstandard import ZstdDecompressor
printable_no_punct = string.digits + string.ascii_letters + string.whitespace
@@ -178,3 +179,24 @@ def random_idents_from_query(query="*",
raise RuntimeError('to few documents')
idents = [doc["_source"]["ident"] for doc in payload["hits"]["hits"]]
return random.sample(idents, r)
+
+
def zstdlines(filename, chunk_size=65536):
    """
    Generator over lines from a zstd compressed file.

    The file is decompressed in a streaming fashion, ``chunk_size`` bytes of
    decompressed data at a time, so the whole payload is never held in memory.

    Args:
        filename: path to a zstd compressed, UTF-8 encoded, newline delimited
            file.
        chunk_size: number of decompressed bytes requested per read.

    Yields:
        Each line as ``str``, without the trailing newline. A final line that
        is not newline terminated is yielded as well.

    Raises:
        UnicodeDecodeError: if a line is not valid UTF-8.
    """
    decomp = ZstdDecompressor()
    with open(filename, "rb") as f, decomp.stream_reader(f) as reader:
        # Bytes after the last newline seen so far, i.e. a line that is not
        # complete yet. Kept as bytes, because a chunk boundary may fall in the
        # middle of a multi-byte UTF-8 sequence; splitting on b"\n" is safe, as
        # 0x0A never occurs inside such a sequence.
        pending = b""
        while True:
            chunk = reader.read(chunk_size)
            if not chunk:
                break
            # Always carry the previous remainder forward, even if this chunk
            # contains no newline at all (lines longer than one chunk).
            *complete, pending = (pending + chunk).split(b"\n")
            for raw in complete:
                yield raw.decode("utf-8")
        # Input did not end with a newline: emit the trailing partial line
        # instead of dropping it.
        if pending:
            yield pending.decode("utf-8")