diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-02-02 01:53:11 +0100 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-02-02 01:53:11 +0100 |
commit | 86e5b3527b186bcdde3a47bb950914c45f9f49bd (patch) | |
tree | 72a9ad1d4f03b4336f79cfd7080c2d0dc5de08a4 | |
parent | 34de9decb7211d0e83e796049cf64d826b34ec8f (diff) | |
download | fuzzycat-86e5b3527b186bcdde3a47bb950914c45f9f49bd.tar.gz fuzzycat-86e5b3527b186bcdde3a47bb950914c45f9f49bd.zip |
fix line reading from bytes
-rw-r--r-- | fuzzycat/utils.py | 19 |
1 file changed, 16 insertions, 3 deletions
```diff
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index b43cbcf..4c961d9 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -183,19 +183,32 @@ def random_idents_from_query(query="*",
     return random.sample(idents, r)
 
 
-def zstdlines(filename):
+def zstdlines(filename, encoding="utf-8", bufsize=65536):
     """
     Generator over lines from a zstd compressed file.
+
+    >>> for line in zstdlines("file.zst"):
+    ...     print(line)
+
     """
     decomp = ZstdDecompressor()
     with open(filename, "rb") as f:
         with decomp.stream_reader(f) as reader:
             prev_line = ""
             while True:
-                chunk = reader.read(65536)
+                chunk = reader.read(bufsize)
                 if not chunk:
                     break
-                string_data = chunk.decode('utf-8')
+                while True:
+                    # We start with bytes but want unicode, which might not
+                    # align; so we jitter around the end to complete the
+                    # codepoint.
+                    try:
+                        string_data = chunk.decode(encoding)
+                    except UnicodeDecodeError:
+                        chunk = chunk + reader.read(1)
+                    else:
+                        break
                 lines = string_data.split("\n")
                 for i, line in enumerate(lines[:-1]):
                     if i == 0:
```