From 7b900c757a4306be0aeb90c1b13ccb9fa266e621 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Thu, 1 Jul 2021 16:49:01 -0700 Subject: sandcrawler slugify: lower-case greek ambiguity (OCR) --- fuzzycat/cluster.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index 4e70bdd..c8384c1 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -151,10 +151,20 @@ SANDCRAWLER_CHAR_MAP = { '\N{Latin capital letter T with stroke}': 'T', '\N{Latin small letter t with stroke}': 't', - # bnewbold additions + # bnewbold additions; mostly Latin-ish OCR ambiguous '\N{MICRO SIGN}': 'u', '\N{LATIN SMALL LETTER C}': 'c', '\N{LATIN SMALL LETTER F WITH HOOK}': 'f', + '\N{Greek Small Letter Alpha}': 'a', + '\N{Greek Small Letter Beta}': 'b', + '\N{Greek Small Letter Iota}': 'i', + '\N{Greek Small Letter Kappa}': 'k', + '\N{Greek Small Letter Chi}': 'x', + '\N{Greek Small Letter Upsilon}': 'u', + '\N{Greek Small Letter Nu}': 'v', + '\N{Greek Small Letter Gamma}': 'y', + '\N{Greek Small Letter Tau}': 't', + '\N{Greek Small Letter Omicron}': 'o', # bnewbold map-to-null (for non-printing stuff not in the regex) '\N{PARTIAL DIFFERENTIAL}': '', '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '', @@ -193,7 +203,7 @@ def sandcrawler_slugify(raw: str) -> str: slug = slug.replace("'", "'") # iterate over all chars and replace from map, if in map; then lower-case again - slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]) + slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]).lower() # early bailout before executing regex if not slug: @@ -217,6 +227,7 @@ def test_sandcrawler_slugify() -> None: ("علمية", "علمية"), ("期刊的数字", "期刊的数字"), ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"), + ("γ-Globulin", "yglobulin"), # "MICRO SIGN" ("\xb5meter", "umeter"), -- cgit v1.2.3