diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-07-09 13:26:35 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-07-09 13:26:35 +0200 |
commit | 002764b5b1f8f27bd8ae42d33b2a6f42a2a4b7a1 (patch) | |
tree | 38b7aa860202812f951ee5dc86da3a79200258ff /fuzzycat/cluster.py | |
parent | f9ef1c989b4f85c81ac5f24b08f0d636636e7a4b (diff) | |
parent | e05f4c4973fc3573d3707d4d90779fad094ced6f (diff) | |
download | fuzzycat-002764b5b1f8f27bd8ae42d33b2a6f42a2a4b7a1.tar.gz fuzzycat-002764b5b1f8f27bd8ae42d33b2a6f42a2a4b7a1.zip |
Merge branch 'master' of git.archive.org:webgroup/fuzzycat
* 'master' of git.archive.org:webgroup/fuzzycat:
simplify README for general audience; move some content to notes
sandcrawler slugify: lower-case greek ambiguity (OCR)
DOI clean/normalize helper; and use in verification etc
verify: page count parsing and comparison improvements
Diffstat (limited to 'fuzzycat/cluster.py')
-rw-r--r-- | fuzzycat/cluster.py | 15 |
1 file changed, 13 insertions, 2 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py index 4e70bdd..c8384c1 100644 --- a/fuzzycat/cluster.py +++ b/fuzzycat/cluster.py @@ -151,10 +151,20 @@ SANDCRAWLER_CHAR_MAP = { '\N{Latin capital letter T with stroke}': 'T', '\N{Latin small letter t with stroke}': 't', - # bnewbold additions + # bnewbold additions; mostly Latin-ish OCR ambiguous '\N{MICRO SIGN}': 'u', '\N{LATIN SMALL LETTER C}': 'c', '\N{LATIN SMALL LETTER F WITH HOOK}': 'f', + '\N{Greek Small Letter Alpha}': 'a', + '\N{Greek Small Letter Beta}': 'b', + '\N{Greek Small Letter Iota}': 'i', + '\N{Greek Small Letter Kappa}': 'k', + '\N{Greek Small Letter Chi}': 'x', + '\N{Greek Small Letter Upsilon}': 'u', + '\N{Greek Small Letter Nu}': 'v', + '\N{Greek Small Letter Gamma}': 'y', + '\N{Greek Small Letter Tau}': 't', + '\N{Greek Small Letter Omicron}': 'o', # bnewbold map-to-null (for non-printing stuff not in the regex) '\N{PARTIAL DIFFERENTIAL}': '', '\N{LATIN LETTER INVERTED GLOTTAL STOP}': '', @@ -193,7 +203,7 @@ def sandcrawler_slugify(raw: str) -> str: slug = slug.replace("'", "'") # iterate over all chars and replace from map, if in map; then lower-case again - slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]) + slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]).lower() # early bailout before executing regex if not slug: @@ -217,6 +227,7 @@ def test_sandcrawler_slugify() -> None: ("علمية", "علمية"), ("期刊的数字", "期刊的数字"), ("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"), + ("γ-Globulin", "yglobulin"), # "MICRO SIGN" ("\xb5meter", "umeter"), |