diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2021-02-02 01:53:11 +0100 | 
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2021-02-02 01:53:11 +0100 | 
| commit | 86e5b3527b186bcdde3a47bb950914c45f9f49bd (patch) | |
| tree | 72a9ad1d4f03b4336f79cfd7080c2d0dc5de08a4 | |
| parent | 34de9decb7211d0e83e796049cf64d826b34ec8f (diff) | |
| download | fuzzycat-86e5b3527b186bcdde3a47bb950914c45f9f49bd.tar.gz fuzzycat-86e5b3527b186bcdde3a47bb950914c45f9f49bd.zip  | |
fix line reading from bytes
| -rw-r--r-- | fuzzycat/utils.py | 19 | 
1 file changed, 16 insertions, 3 deletions
```diff
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index b43cbcf..4c961d9 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -183,19 +183,32 @@ def random_idents_from_query(query="*",
     return random.sample(idents, r)
 
 
-def zstdlines(filename):
+def zstdlines(filename, encoding="utf-8", bufsize=65536):
     """
     Generator over lines from a zstd compressed file.
+
+    >>> for line in zstdlines("file.zst"):
+    ...     print(line)
+
     """
     decomp = ZstdDecompressor()
     with open(filename, "rb") as f:
         with decomp.stream_reader(f) as reader:
             prev_line = ""
             while True:
-                chunk = reader.read(65536)
+                chunk = reader.read(bufsize)
                 if not chunk:
                     break
-                string_data = chunk.decode('utf-8')
+                while True:
+                    # We start with bytes but want unicode, which might not
+                    # align; so we jitter around the end to complete the
+                    # codepoint.
+                    try:
+                        string_data = chunk.decode(encoding)
+                    except UnicodeDecodeError:
+                        chunk = chunk + reader.read(1)
+                    else:
+                        break
                 lines = string_data.split("\n")
                 for i, line in enumerate(lines[:-1]):
                     if i == 0:
```
