From 86e5b3527b186bcdde3a47bb950914c45f9f49bd Mon Sep 17 00:00:00 2001
From: Martin Czygan <martin.czygan@gmail.com>
Date: Tue, 2 Feb 2021 01:53:11 +0100
Subject: fix line reading from bytes

---
 fuzzycat/utils.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/fuzzycat/utils.py b/fuzzycat/utils.py
index b43cbcf..4c961d9 100644
--- a/fuzzycat/utils.py
+++ b/fuzzycat/utils.py
@@ -183,19 +183,32 @@ def random_idents_from_query(query="*",
     return random.sample(idents, r)
 
 
-def zstdlines(filename):
+def zstdlines(filename, encoding="utf-8", bufsize=65536):
     """
     Generator over lines from a zstd compressed file.
+
+    >>> for line in zstdlines("file.zst"):
+    ...     print(line)
+
     """
     decomp = ZstdDecompressor()
     with open(filename, "rb") as f:
         with decomp.stream_reader(f) as reader:
             prev_line = ""
             while True:
-                chunk = reader.read(65536)
+                chunk = reader.read(bufsize)
                 if not chunk:
                     break
-                string_data = chunk.decode('utf-8')
+                while True:
+                    # The chunk may end inside a multi-byte character; on a
+                    # decode error, read one more byte and retry.
+                    # NOTE(review): never terminates on undecodable input.
+                    try:
+                        string_data = chunk.decode(encoding)
+                    except UnicodeDecodeError:
+                        chunk = chunk + reader.read(1)
+                    else:
+                        break
                 lines = string_data.split("\n")
                 for i, line in enumerate(lines[:-1]):
                     if i == 0:
-- 
cgit v1.2.3