diff options
Diffstat (limited to 'fuzzycat/sandcrawler.py')
-rw-r--r-- | fuzzycat/sandcrawler.py | 5 |
1 file changed, 3 insertions, 2 deletions
diff --git a/fuzzycat/sandcrawler.py b/fuzzycat/sandcrawler.py index 958756a..63b85e6 100644 --- a/fuzzycat/sandcrawler.py +++ b/fuzzycat/sandcrawler.py @@ -1,6 +1,7 @@ -import regex import unicodedata +import regex + # from http://zderadicka.eu/removing-diacritics-marks-from-strings/ SANDCRAWLER_CHAR_MAP = { '\N{Latin capital letter AE}': 'AE', @@ -63,6 +64,7 @@ SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile( r"[\s\p{Punctuation}\p{M}\p{InCombiningDiacriticalMarks}\u2000-\u206F\u2E00-\u2E7F’·“”‘’“”«»「」¿–±§_`°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]" ) + def sandcrawler_slugify(raw: str) -> str: """ Python re-implementation of sandcrawler Scala code for string comparison @@ -155,4 +157,3 @@ def test_sandcrawler_slugify() -> None: print(unicodedata.name(c)) print(in_str) assert sandcrawler_slugify(in_str) == out_str - |