package skate import ( "bytes" "index/suffixarray" "io" "sync" ) const matchSetSep = "\u001F" // MatchSet allows to match a string against multiple strings at once. Rough // performance number: looking up ~80K different strings in 1M strings takes // about <2s. // // The original use case was to match ~80K+ journal name abbreviations against // 100M+ container name strings in references. type MatchSet struct { index *suffixarray.Index b []byte sep string } var ( bufPool = sync.Pool{ New: func() interface{} { var buf bytes.Buffer return buf }, } ) // NewMatchSet creates a new MatchSet given a number of strings patterns. If // any of the given strings contains the ASCII unit separator (1F), you may get // unexpected results. func NewMatchSet(ss []string) *MatchSet { buf := bufPool.Get().(bytes.Buffer) buf.Reset() defer bufPool.Put(buf) for _, s := range ss { _, _ = io.WriteString(&buf, matchSetSep) _, _ = io.WriteString(&buf, s) } return &MatchSet{ b: buf.Bytes(), index: suffixarray.New(buf.Bytes()), sep: matchSetSep, } } // Lookup will match a given string as against all strings in the match set. If // n < 0, all results are returned, otherwise at most n results are returned. func (ms *MatchSet) Lookup(s string, n int) (result []string) { buf := bufPool.Get().(bytes.Buffer) buf.Reset() defer bufPool.Put(buf) _, _ = io.WriteString(&buf, matchSetSep) _, _ = io.WriteString(&buf, s) indices := ms.index.Lookup(buf.Bytes(), n) for _, i := range indices { var ( offset = i + len(matchSetSep) + 1 k = bytes.Index(ms.b[offset:], []byte(matchSetSep)) ) if k == -1 { k = len(ms.b) } else { k = k + offset } result = append(result, string(ms.b[i+1:k])) } return }