aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/utils.py
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-02-02 01:53:11 +0100
committer Martin Czygan <martin.czygan@gmail.com> 2021-02-02 01:53:11 +0100
commit 86e5b3527b186bcdde3a47bb950914c45f9f49bd (patch)
tree 72a9ad1d4f03b4336f79cfd7080c2d0dc5de08a4 /fuzzycat/utils.py
parent 34de9decb7211d0e83e796049cf64d826b34ec8f (diff)
download fuzzycat-86e5b3527b186bcdde3a47bb950914c45f9f49bd.tar.gz
fuzzycat-86e5b3527b186bcdde3a47bb950914c45f9f49bd.zip
fix line reading from bytes
Diffstat (limited to 'fuzzycat/utils.py')
-rw-r--r-- fuzzycat/utils.py 19
1 files changed, 16 insertions, 3 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index b43cbcf..4c961d9 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -183,19 +183,32 @@ def random_idents_from_query(query="*",
return random.sample(idents, r)
-def zstdlines(filename):
+def zstdlines(filename, encoding="utf-8", bufsize=65536):
"""
Generator over lines from a zstd compressed file.
+
+ >>> for line in zstdlines("file.zst"):
+ ... print(line)
+
"""
decomp = ZstdDecompressor()
with open(filename, "rb") as f:
with decomp.stream_reader(f) as reader:
prev_line = ""
while True:
- chunk = reader.read(65536)
+ chunk = reader.read(bufsize)
if not chunk:
break
- string_data = chunk.decode('utf-8')
+ while True:
+ # We start with bytes but want unicode, which might not
+ # align; so we jitter around the end to complete the
+ # codepoint.
+ try:
+ string_data = chunk.decode(encoding)
+ except UnicodeDecodeError:
+ chunk = chunk + reader.read(1)
+ else:
+ break
lines = string_data.split("\n")
for i, line in enumerate(lines[:-1]):
if i == 0: