fix line reading from bytes

author: Martin Czygan <martin.czygan@gmail.com> 2021-02-02 01:53:11 +0100
committer: Martin Czygan <martin.czygan@gmail.com> 2021-02-02 01:53:11 +0100
commit: 86e5b3527b186bcdde3a47bb950914c45f9f49bd (patch)
tree: 72a9ad1d4f03b4336f79cfd7080c2d0dc5de08a4 /fuzzycat/utils.py
parent: 34de9decb7211d0e83e796049cf64d826b34ec8f (diff)
download: fuzzycat-86e5b3527b186bcdde3a47bb950914c45f9f49bd.tar.gz fuzzycat-86e5b3527b186bcdde3a47bb950914c45f9f49bd.zip
1 files changed, 16 insertions, 3 deletions
diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index b43cbcf..4c961d9 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -183,19 +183,32 @@ def random_idents_from_query(query="*",
     return random.sample(idents, r)
 
 
-def zstdlines(filename):
+def zstdlines(filename, encoding="utf-8", bufsize=65536):
     """
     Generator over lines from a zstd compressed file.
+
+    >>> for line in zstdlines("file.zst"):
+    ...     print(line)
+
     """
     decomp = ZstdDecompressor()
     with open(filename, "rb") as f:
         with decomp.stream_reader(f) as reader:
             prev_line = ""
             while True:
-                chunk = reader.read(65536)
+                chunk = reader.read(bufsize)
                 if not chunk:
                     break
-                string_data = chunk.decode('utf-8')
+                while True:
+                    # We start with bytes but want unicode, which might not
+                    # align; so we jitter around the end to complete the
+                    # codepoint.
+                    try:
+                        string_data = chunk.decode(encoding)
+                    except UnicodeDecodeError:
+                        chunk = chunk + reader.read(1)
+                    else:
+                        break
                 lines = string_data.split("\n")
                 for i, line in enumerate(lines[:-1]):
                     if i == 0:
author	Martin Czygan <martin.czygan@gmail.com>	2021-02-02 01:53:11 +0100
committer	Martin Czygan <martin.czygan@gmail.com>	2021-02-02 01:53:11 +0100
commit	86e5b3527b186bcdde3a47bb950914c45f9f49bd (patch)
tree	72a9ad1d4f03b4336f79cfd7080c2d0dc5de08a4 /fuzzycat/utils.py
parent	34de9decb7211d0e83e796049cf64d826b34ec8f (diff)
download	fuzzycat-86e5b3527b186bcdde3a47bb950914c45f9f49bd.tar.gz fuzzycat-86e5b3527b186bcdde3a47bb950914c45f9f49bd.zip