aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/sandcrawler.py
diff options
context:
space:
mode:
author Martin Czygan <martin@archive.org> 2021-11-16 19:06:26 +0000
committer Martin Czygan <martin@archive.org> 2021-11-16 19:06:26 +0000
commit 24dcddc4e4cff744e7c0a964856329d2ac69601d (patch)
tree ad8650892805e55ec4a6958f9e1539eb675332b8 /fuzzycat/sandcrawler.py
parent 282f315c6ba3643c8c614220ab2f7e1d55de3658 (diff)
parent 409392d66c3a6debe5bc69c0e2308209ac74ee35 (diff)
download fuzzycat-24dcddc4e4cff744e7c0a964856329d2ac69601d.tar.gz
fuzzycat-24dcddc4e4cff744e7c0a964856329d2ac69601d.zip
Merge branch 'martin-matcher-class' into 'master'
turn "match_release_fuzzy" into a class See merge request webgroup/fuzzycat!10
Diffstat (limited to 'fuzzycat/sandcrawler.py')
-rw-r--r-- fuzzycat/sandcrawler.py 5
1 files changed, 3 insertions, 2 deletions
diff --git a/fuzzycat/sandcrawler.py b/fuzzycat/sandcrawler.py
index 958756a..63b85e6 100644
--- a/fuzzycat/sandcrawler.py
+++ b/fuzzycat/sandcrawler.py
@@ -1,6 +1,7 @@
-import regex
import unicodedata
+import regex
+
# from http://zderadicka.eu/removing-diacritics-marks-from-strings/
SANDCRAWLER_CHAR_MAP = {
'\N{Latin capital letter AE}': 'AE',
@@ -63,6 +64,7 @@ SANDCRAWLER_REMOVE_CHAR_REGEX = regex.compile(
r"[\s\p{Punctuation}\p{M}\p{InCombiningDiacriticalMarks}\u2000-\u206F\u2E00-\u2E7F’·“”‘’“”«»「」¿–±§_`°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]"
)
+
def sandcrawler_slugify(raw: str) -> str:
"""
Python re-implementation of sandcrawler Scala code for string comparison
@@ -155,4 +157,3 @@ def test_sandcrawler_slugify() -> None:
print(unicodedata.name(c))
print(in_str)
assert sandcrawler_slugify(in_str) == out_str
-