aboutsummaryrefslogtreecommitdiffstats
path: root/fuzzycat/cluster.py
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-07-09 13:26:35 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-07-09 13:26:35 +0200
commit 002764b5b1f8f27bd8ae42d33b2a6f42a2a4b7a1 (patch)
tree 38b7aa860202812f951ee5dc86da3a79200258ff /fuzzycat/cluster.py
parent f9ef1c989b4f85c81ac5f24b08f0d636636e7a4b (diff)
parent e05f4c4973fc3573d3707d4d90779fad094ced6f (diff)
download fuzzycat-002764b5b1f8f27bd8ae42d33b2a6f42a2a4b7a1.tar.gz
fuzzycat-002764b5b1f8f27bd8ae42d33b2a6f42a2a4b7a1.zip
Merge branch 'master' of git.archive.org:webgroup/fuzzycat
* 'master' of git.archive.org:webgroup/fuzzycat:
  simplify README for general audience; move some content to notes
  sandcrawler slugify: lower-case greek ambiguity (OCR)
  DOI clean/normalize helper; and use in verification etc
  verify: page count parsing and comparison improvements
Diffstat (limited to 'fuzzycat/cluster.py')
-rw-r--r-- fuzzycat/cluster.py 15
1 files changed, 13 insertions, 2 deletions
diff --git a/fuzzycat/cluster.py b/fuzzycat/cluster.py
index 4e70bdd..c8384c1 100644
--- a/fuzzycat/cluster.py
+++ b/fuzzycat/cluster.py
@@ -151,10 +151,20 @@ SANDCRAWLER_CHAR_MAP = {
'\N{Latin capital letter T with stroke}': 'T',
'\N{Latin small letter t with stroke}': 't',
- # bnewbold additions
+ # bnewbold additions; mostly Latin-ish OCR ambiguous
'\N{MICRO SIGN}': 'u',
'\N{LATIN SMALL LETTER C}': 'c',
'\N{LATIN SMALL LETTER F WITH HOOK}': 'f',
+ '\N{Greek Small Letter Alpha}': 'a',
+ '\N{Greek Small Letter Beta}': 'b',
+ '\N{Greek Small Letter Iota}': 'i',
+ '\N{Greek Small Letter Kappa}': 'k',
+ '\N{Greek Small Letter Chi}': 'x',
+ '\N{Greek Small Letter Upsilon}': 'u',
+ '\N{Greek Small Letter Nu}': 'v',
+ '\N{Greek Small Letter Gamma}': 'y',
+ '\N{Greek Small Letter Tau}': 't',
+ '\N{Greek Small Letter Omicron}': 'o',
# bnewbold map-to-null (for non-printing stuff not in the regex)
'\N{PARTIAL DIFFERENTIAL}': '',
'\N{LATIN LETTER INVERTED GLOTTAL STOP}': '',
@@ -193,7 +203,7 @@ def sandcrawler_slugify(raw: str) -> str:
slug = slug.replace("&apos;", "'")
# iterate over all chars and replace from map, if in map; then lower-case again
- slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug])
+ slug = ''.join([SANDCRAWLER_CHAR_MAP.get(c, c) for c in slug]).lower()
# early bailout before executing regex
if not slug:
@@ -217,6 +227,7 @@ def test_sandcrawler_slugify() -> None:
("علمية", "علمية"),
("期刊的数字", "期刊的数字"),
("les pré-impressions explorées à partir", "lespreimpressionsexploreesapartir"),
+ ("γ-Globulin", "yglobulin"),
# "MICRO SIGN"
("\xb5meter", "umeter"),