diff options
Diffstat (limited to 'skate/nysiis.go')
-rw-r--r-- | skate/nysiis.go | 202 |
1 files changed, 202 insertions, 0 deletions
// Source file: skate/nysiis.go (package skate). The code below depends on the
// standard library "strings" package.

// runestring is a slice of runes with bounds-safe accessors, used so that the
// phonetic rules below can look ahead and behind without explicit range checks.
type runestring []rune

// SafeAt returns the rune at pos, or the null rune (0) when pos lies outside
// the bounds of the runestring.
func (r *runestring) SafeAt(pos int) rune {
	if pos < 0 || pos >= len(*r) {
		return 0
	}
	return (*r)[pos]
}

// SafeSubstr returns the length runes starting at pos as a string. It returns
// the empty string when length is negative or when any part of the requested
// window falls outside the runestring.
func (r *runestring) SafeSubstr(pos int, length int) string {
	if pos < 0 || length < 0 || pos > len(*r) || pos+length > len(*r) {
		return ""
	}
	return string((*r)[pos : pos+length])
}

// Del removes the runes at the given positions. Positions outside the bounds
// of the runestring are ignored. Positions are applied one after another, so
// each index refers to the runestring as already shortened by the deletions
// that precede it in the argument list.
func (r *runestring) Del(pos ...int) {
	for _, i := range pos {
		// i == len(*r) must be rejected too: (*r)[i+1:] would be out of range.
		if i >= 0 && i < len(*r) {
			*r = append((*r)[:i], (*r)[i+1:]...)
		}
	}
}

// Contains reports whether the window of length runes starting at start is
// equal to any of the criteria strings.
func (r *runestring) Contains(start int, length int, criteria ...string) bool {
	substring := r.SafeSubstr(start, length)
	for _, c := range criteria {
		if substring == c {
			return true
		}
	}
	return false
}

// cleanInput trims surrounding whitespace from input and upper-cases it.
func cleanInput(input string) string {
	return strings.ToUpper(strings.TrimSpace(input))
}

// isVowelNoY reports whether c is one of the upper-case vowels A, E, I, O, U.
// Y is not treated as a vowel.
func isVowelNoY(c rune) bool {
	switch c {
	case 'A', 'E', 'I', 'O', 'U':
		return true
	default:
		return false
	}
}

// NYSIIS computes the NYSIIS phonetic encoding of the input string. It is a
// modification of the traditional Soundex algorithm.
//
// The input is trimmed and upper-cased, and every character other than A-Z is
// discarded before encoding. The returned key is at most six characters long;
// an input with no A-Z letters yields the empty string.
func NYSIIS(s1 string) string {
	cleans1 := runestring(cleanInput(s1))
	input := runestring(make([]rune, 0, len(s1)))
	// The output can't be larger than the string itself
	output := runestring(make([]rune, 0, len(s1)))

	// 0. Keep only the upper-case ASCII letters A-Z
	for _, v := range cleans1 {
		if v >= 'A' && v <= 'Z' {
			input = append(input, v)
		}
	}
	if len(input) == 0 {
		return ""
	}

	// 1. Transcoding first characters
	switch input[0] {
	case 'M':
		if input.SafeSubstr(0, 3) == "MAC" {
			// MAC -> MCC
			input[1] = 'C'
		}
	case 'K':
		if input.SafeSubstr(0, 2) == "KN" {
			// KN -> NN
			input[0] = 'N'
		} else {
			// K -> C
			input[0] = 'C'
		}
	case 'P':
		switch input.SafeAt(1) {
		case 'H':
			// PH -> FF
			input[0] = 'F'
			input[1] = 'F'
		case 'F':
			// PF -> FF
			input[0] = 'F'
		}
	case 'S':
		if input.SafeSubstr(0, 3) == "SCH" {
			// SCH -> SSS
			input[1] = 'S'
			input[2] = 'S'
		}
	}

	// 2. Transcoding last characters
	switch input.SafeSubstr(len(input)-2, 2) {
	case "EE", "IE":
		// EE, IE -> Y
		input.Del(len(input) - 2)
		input[len(input)-1] = 'Y'
	case "DT", "RT", "RD", "NT", "ND":
		// DT, RT, RD, NT, ND -> D
		input.Del(len(input) - 2)
		input[len(input)-1] = 'D'
	}

	// 3. First character of key = first character of name
	output = append(output, input[0])
	last := input[0]

	// 4. Transcode the remaining characters in place. len(input) is
	// re-evaluated on every iteration because the KN rule shortens input.
	for i := 1; i < len(input); i++ {
		switch input[i] {
		case 'A', 'I', 'O', 'U':
			// A, I, O, U -> A (E is handled separately)
			input[i] = 'A'
		case 'E':
			// EV -> AF, else E -> A
			if input.SafeAt(i+1) == 'V' {
				input[i+1] = 'F'
			}
			input[i] = 'A'
		case 'Q':
			// Q -> G
			input[i] = 'G'
		case 'Z':
			// Z -> S
			input[i] = 'S'
		case 'M':
			// M -> N
			input[i] = 'N'
		case 'K':
			// KN -> N, else K -> C
			if input.SafeAt(i+1) == 'N' {
				// Dropping the K moves the N into position i.
				input.Del(i)
			} else {
				input[i] = 'C'
			}
		case 'S':
			// SCH -> SSS
			if input.SafeSubstr(i, 3) == "SCH" {
				input[i+1] = 'S'
				input[i+2] = 'S'
			}
		case 'P':
			// PH -> FF
			if input.SafeAt(i+1) == 'H' {
				input[i] = 'F'
				input[i+1] = 'F'
			}
		case 'H':
			// H -> previous character if the previous or the next
			// character is a non-vowel. prev has already been transcoded;
			// next has not.
			prev := input.SafeAt(i - 1)
			next := input.SafeAt(i + 1)
			if !isVowelNoY(prev) || !isVowelNoY(next) {
				input[i] = prev
			}
		case 'W':
			// W -> previous character if that character is a vowel
			prev := input.SafeAt(i - 1)
			if isVowelNoY(prev) {
				input[i] = prev
			}
		}
		// Only emit a character when it differs from the one before it.
		if input[i] != last && input[i] != 0 {
			output = append(output, input[i])
		}
		last = input[i]
	}

	// 5. Clean up the end of the key. Have to be careful here because we've
	// already added the first key value.
	// NOTE(review): a two-character key such as "AS" loses both characters
	// here and yields ""; confirm whether that is intended.
	if len(output) > 1 {
		// remove trailing S
		if output.SafeAt(len(output)-1) == 'S' {
			output.Del(len(output) - 1)
		}
		// trailing AY -> Y
		if len(output) > 2 && output.SafeSubstr(len(output)-2, 2) == "AY" {
			output.Del(len(output) - 2)
		}
		// trailing A -> remove it
		if output.SafeAt(len(output)-1) == 'A' {
			output.Del(len(output) - 1)
		}
	}

	// 6. The key is capped at six characters.
	if len(output) > 6 {
		return string(output[0:6])
	}
	return string(output)
}