diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-07-01 16:49:01 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-07-01 16:49:01 -0700 |
commit | 7b900c757a4306be0aeb90c1b13ccb9fa266e621 (patch) | |
tree | 1e2148cdeaef8283f174621327c4d000d171f277 | |
parent | 839c4e7f3187b4eed4b5adbb9212a9a9456bf16f (diff) | |
download | fuzzycat-7b900c757a4306be0aeb90c1b13ccb9fa266e621.tar.gz fuzzycat-7b900c757a4306be0aeb90c1b13ccb9fa266e621.zip |
sandcrawler slugify: lower-case greek ambiguity (OCR)
-rw-r--r-- | fuzzycat/cluster.py | 15 |
1 file changed, 13 insertions, 2 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index 4e70bdd..c8384c1 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -151,10 +151,20 @@ SANDCRAWLER_CHAR_MAP = { '\N{Latin capital letter T with stroke}': 'T', '\N{Latin small letter t with stroke}': 't', - # bnewbold additions + # bnewbold additions; mostly Latin-ish OCR ambiguous '\N{MICRO SIGN}': 'u', '\N{LATIN SMALL LETTER C}': 'c', '\N{LATIN SMALL LETTER F WITH HOOK}': 'f', + '\N{Greek Small Letter Alpha}': 'a', + '\N{Greek Small Letter Beta}': 'b', + '\N{Greek Small Letter Iota}': 'i', + '\N{Greek Small Letter Kappa}': 'k', + '\N{Greek Small Letter Chi}': 'x', + '\N{Greek Small Letter Upsilon}': 'u', + '\N{Greek Small Letter Nu}': 'v', + '\N{Greek Small Letter Gamma}': 'y', + '\N{Greek Small Letter Tau}': 't', + '\N{Greek Small Letter Omicron}': 'o', # bnewbold map-to-null (for non-printing stuff not in the regex) '\N{PARTIAL DIFFERENTIAL}': '', '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '', @@ -193,7 +203,7 @@ def sandcrawler_slugify(raw: str) -> str: slug = slug.replace("&apos;", "'") # iterate over all chars and replace from map, if in map; then lower-case again - slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]) + slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]).lower() # early bailout before executing regex if not slug: @@ -217,6 +227,7 @@ def test_sandcrawler_slugify() -> None: ("علمية", "علمية"), ("期刊的数字", "期刊的数字"), ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"), + ("γ-Globulin", "yglobulin"), # "MICRO SIGN" ("\xb5meter", "umeter"), |