aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fuzzycat/cluster.py4
1 files changed, 4 insertions, 0 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index fa54ec6..4ccf8a9 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -11,6 +11,7 @@ import logging
import operator
import os
import re
+import string
import subprocess
import sys
import tempfile
@@ -54,12 +55,14 @@ ws_replacer = str.maketrans({"\t": " ", "\n": " "})
# Matches one or more consecutive characters that are non-word characters
# or underscores.
non_word_re = re.compile(r'[\W_]+', re.UNICODE)
# ASCII digits, ASCII letters and whitespace characters, i.e. no punctuation.
# Python 3 has no `string.letters` (accessing it raises AttributeError at
# import time); `string.ascii_letters` is the available constant.
printable_no_punct = string.digits + string.ascii_letters + string.whitespace
+
def slugify_string(s: str) -> str:
    """
    Lowercase ``s`` and drop every character that is not contained in
    ``printable_no_punct``. The surviving characters keep their original
    order; nothing is collapsed or replaced.
    """
    lowered = s.lower()
    kept = [char for char in lowered if char in printable_no_punct]
    return ''.join(kept)
+
# Notes: untie from release_entity, as we are only using a few fields. Maybe
# it's a JSON blob, with a pydantic spec and schema.
@@ -107,6 +110,7 @@ def release_key_title_ngram(doc: KeyDoc, n=3) -> Tuple[str, str]:
key = ''.join(tokens[:3] + tokens[-3:])
return (ident, key)
+
def sort_by_column(filename: str,
opts: str = "-k 2",
fast: bool = True,