From 86e5b3527b186bcdde3a47bb950914c45f9f49bd Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 2 Feb 2021 01:53:11 +0100 Subject: fix line reading from bytes --- fuzzycat/utils.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py index b43cbcf..4c961d9 100644 --- a/fuzzycat/utils.py +++ b/fuzzycat/utils.py @@ -183,19 +183,32 @@ def random_idents_from_query(query="*", return random.sample(idents, r) -def zstdlines(filename): +def zstdlines(filename, encoding="utf-8", bufsize=65536): """ Generator over lines from a zstd compressed file. + + >>> for line in zstdlines("file.zst"): + ... print(line) + """ decomp = ZstdDecompressor() with open(filename, "rb") as f: with decomp.stream_reader(f) as reader: prev_line = "" while True: - chunk = reader.read(65536) + chunk = reader.read(bufsize) if not chunk: break - string_data = chunk.decode('utf-8') + while True: + # We start with bytes but want unicode, which might not + # align; so we jitter around the end to complete the + # codepoint. + try: + string_data = chunk.decode(encoding) + except UnicodeDecodeError: + chunk = chunk + reader.read(1) + else: + break lines = string_data.split("\n") for i, line in enumerate(lines[:-1]): if i == 0: -- cgit v1.2.3