diff options
author | Martin Czygan <martin@archive.org> | 2021-07-02 16:18:51 +0000 |
---|---|---|
committer | Martin Czygan <martin@archive.org> | 2021-07-02 16:18:51 +0000 |
commit | a7b35c48612427d763c1db348b7a7f4083a0410b (patch) | |
tree | 1e2148cdeaef8283f174621327c4d000d171f277 /fuzzycat/cluster.py | |
parent | 0d5535742786fe78f6509b6606ca381912ed8bc7 (diff) | |
parent | 7b900c757a4306be0aeb90c1b13ccb9fa266e621 (diff) | |
download | fuzzycat-a7b35c48612427d763c1db348b7a7f4083a0410b.tar.gz fuzzycat-a7b35c48612427d763c1db348b7a7f4083a0410b.zip |
Merge branch 'bnewbold-verify-improvements' into 'master'
verify improvements
See merge request webgroup/fuzzycat!4
Diffstat (limited to 'fuzzycat/cluster.py')
-rw-r--r-- | fuzzycat/cluster.py | 15 |
1 file changed, 13 insertions, 2 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 4e70bdd..c8384c1 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -151,10 +151,20 @@ SANDCRAWLER_CHAR_MAP = {
     '\N{Latin capital letter T with stroke}': 'T',
     '\N{Latin small letter t with stroke}': 't',
 
-    # bnewbold additions
+    # bnewbold additions; mostly Latin-ish OCR ambiguous
     '\N{MICRO SIGN}': 'u',
     '\N{LATIN SMALL LETTER C}': 'c',
     '\N{LATIN SMALL LETTER F WITH HOOK}': 'f',
+    '\N{Greek Small Letter Alpha}': 'a',
+    '\N{Greek Small Letter Beta}': 'b',
+    '\N{Greek Small Letter Iota}': 'i',
+    '\N{Greek Small Letter Kappa}': 'k',
+    '\N{Greek Small Letter Chi}': 'x',
+    '\N{Greek Small Letter Upsilon}': 'u',
+    '\N{Greek Small Letter Nu}': 'v',
+    '\N{Greek Small Letter Gamma}': 'y',
+    '\N{Greek Small Letter Tau}': 't',
+    '\N{Greek Small Letter Omicron}': 'o',
     # bnewbold map-to-null (for non-printing stuff not in the regex)
     '\N{PARTIAL DIFFERENTIAL}': '',
     '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '',
@@ -193,7 +203,7 @@ def sandcrawler_slugify(raw: str) -> str:
     slug = slug.replace("'", "'")
 
     # iterate over all chars and replace from map, if in map; then lower-case again
-    slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug])
+    slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]).lower()
 
     # early bailout before executing regex
     if not slug:
@@ -217,6 +227,7 @@ def test_sandcrawler_slugify() -> None:
         ("علمية", "علمية"),
         ("期刊的数字", "期刊的数字"),
         ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"),
+        ("γ-Globulin", "yglobulin"),
 
         # "MICRO SIGN"
         ("\xb5meter", "umeter"),