aboutsummaryrefslogtreecommitdiffstats
path: root/skate/nysiis.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/nysiis.go')
-rw-r--r--skate/nysiis.go202
1 files changed, 202 insertions, 0 deletions
diff --git a/skate/nysiis.go b/skate/nysiis.go
new file mode 100644
index 0000000..34cd5d7
--- /dev/null
+++ b/skate/nysiis.go
@@ -0,0 +1,202 @@
+package skate
+
+import "strings"
+
// runestring is a string held as a slice of runes so that it can be indexed
// and edited one character at a time.
type runestring []rune

// SafeAt returns the rune at index pos, or the null rune (0) when pos lies
// outside the bounds of the runestring.
func (r *runestring) SafeAt(pos int) rune {
	if pos < 0 || pos >= len(*r) {
		return 0
	}
	return (*r)[pos]
}

// SafeSubstr returns the length runes starting at index pos as a string. It
// returns "" when length is negative or when any part of the requested range
// falls outside the bounds of the runestring.
func (r *runestring) SafeSubstr(pos int, length int) string {
	n := len(*r)
	// Compare length against the remaining room instead of computing
	// pos+length, which could overflow for very large arguments.
	if pos < 0 || length < 0 || pos > n || length > n-pos {
		return ""
	}
	return string((*r)[pos : pos+length])
}

// Del deletes the runes at the given positions. The positions are applied one
// after another, so each index refers to the runestring as it stands after
// the preceding deletions. Indexes outside the bounds of the runestring are
// ignored. The deletion shifts the remaining runes down within the existing
// backing array.
func (r *runestring) Del(pos ...int) {
	for _, i := range pos {
		// i must be strictly below the length: slicing at i+1 panics when
		// i == len(*r).
		if i >= 0 && i < len(*r) {
			*r = append((*r)[:i], (*r)[i+1:]...)
		}
	}
}

// Contains reports whether the substring of the given length starting at
// start is equal to any of the criteria strings. An out-of-bounds range is
// treated as the empty substring (see SafeSubstr).
func (r *runestring) Contains(start int, length int, criteria ...string) bool {
	substring := r.SafeSubstr(start, length)
	for _, c := range criteria {
		if substring == c {
			return true
		}
	}
	return false
}
+
// cleanInput normalizes input for encoding: surrounding whitespace is
// stripped first and the remainder is converted to upper case.
func cleanInput(input string) string {
	trimmed := strings.TrimSpace(input)
	return strings.ToUpper(trimmed)
}
+
// isVowelNoY reports whether c is one of the upper-case vowels A, E, I, O or
// U. The letter Y is not counted as a vowel.
func isVowelNoY(c rune) bool {
	return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U'
}
+
+// NYSIIS computes the NYSIIS phonetic encoding of the input string. It is a
+// modification of the traditional Soundex algorithm.
+func NYSIIS(s1 string) string {
+ cleans1 := runestring(cleanInput(s1))
+ input := runestring(make([]rune, 0, len(s1)))
+ // The output can't be larger than the string itself
+ output := runestring(make([]rune, 0, len(s1)))
+ // 0. Remove all non-ASCII characters
+ for _, v := range cleans1 {
+ if v >= 65 && v <= 90 {
+ input = append(input, v)
+ }
+ }
+ if len(input) == 0 {
+ return ""
+ }
+ // 1. Transcoding first characters
+ switch input[0] {
+ case 'M':
+ if input.SafeSubstr(0, 3) == "MAC" {
+ // MAC -> MCC
+ input[1] = 'C'
+ }
+ case 'K':
+ if input.SafeSubstr(0, 2) == "KN" {
+ // KN -> NN
+ input[0] = 'N'
+ } else {
+ // K -> C
+ input[0] = 'C'
+ }
+ case 'P':
+ next := input.SafeAt(1)
+ if next == 'H' {
+ // PH -> FF
+ input[0] = 'F'
+ input[1] = 'F'
+ } else if next == 'F' {
+ // PF -> FF
+ input[0] = 'F'
+ }
+ case 'S':
+ if input.SafeSubstr(0, 3) == "SCH" {
+ input[1] = 'S'
+ input[2] = 'S'
+ }
+ }
+ // 2. Transcoding last characters
+ switch input.SafeSubstr(len(input)-2, 2) {
+ case "EE", "IE":
+ // EE, IE -> Y
+ input.Del(len(input) - 2)
+ input[len(input)-1] = 'Y'
+ case "DT", "RT", "RD", "NT", "ND":
+ // DT, RT, RD, NT, ND -> D
+ input.Del(len(input) - 2)
+ input[len(input)-1] = 'D'
+ }
+ // 3. First character of key = first character of name
+ output = append(output, input[0])
+ last := input[0]
+ for i := 1; i < len(input); i++ {
+ c := input[i]
+ switch c {
+ case 'A', 'I', 'O', 'U':
+ // A, E, I, O, U -> A (E is separate)
+ input[i] = 'A'
+ case 'E':
+ // EV -> AF, else A
+ if input.SafeAt(i+1) == 'V' {
+ input[i+1] = 'F'
+ }
+ input[i] = 'A'
+ case 'Q':
+ // Q -> G
+ input[i] = 'G'
+ case 'Z':
+ // Z -> S
+ input[i] = 'S'
+ case 'M':
+ // M -> N
+ input[i] = 'N'
+ case 'K':
+ // KN -> N, else K -> C
+ if input.SafeAt(i+1) == 'N' {
+ input.Del(i)
+ } else {
+ input[i] = 'C'
+ }
+ case 'S':
+ // SCH -> SSS
+ if input.SafeSubstr(i, 3) == "SCH" {
+ input[i+1] = 'S'
+ input[i+2] = 'S'
+ }
+ case 'P':
+ // PH -> FF
+ if input.SafeAt(i+1) == 'H' {
+ input[i] = 'F'
+ input[i+1] = 'F'
+ }
+ case 'H':
+ // H -> $(previous character) if previous character or
+ // next character is a non-vowel
+ prev := input.SafeAt(i - 1)
+ next := input.SafeAt(i + 1)
+ if !isVowelNoY(prev) || !isVowelNoY(next) {
+ input[i] = prev
+ }
+ case 'W':
+ prev := input.SafeAt(i - 1)
+ if isVowelNoY(prev) {
+ input[i] = prev
+ }
+ }
+ if input[i] != last && input[i] != 0 {
+ output = append(output, input[i])
+ }
+ last = input[i]
+ }
+ // have to be careful here because we've already added the first
+ // key value
+ if len(output) > 1 {
+ // remove trailing s
+ if output.SafeAt(len(output)-1) == 'S' {
+ output.Del(len(output) - 1)
+ }
+ // trailing AY -> Y
+ if len(output) > 2 && output.SafeSubstr(len(output)-2, 2) == "AY" {
+ output.Del(len(output) - 2)
+ }
+ // trailing A -> remove it
+ if output.SafeAt(len(output)-1) == 'A' {
+ output.Del(len(output) - 1)
+ }
+ }
+ if len(output) > 6 {
+ return string(output[0:6])
+ } else {
+ return string(output)
+ }
+}