diff options
Diffstat (limited to 'skate/matchset.go')
-rw-r--r-- | skate/matchset.go | 73 |
1 files changed, 73 insertions, 0 deletions
diff --git a/skate/matchset.go b/skate/matchset.go new file mode 100644 index 0000000..c8882e2 --- /dev/null +++ b/skate/matchset.go @@ -0,0 +1,73 @@ +package skate + +import ( + "bytes" + "index/suffixarray" + "io" + "sync" +) + +const matchSetSep = "\u001F" + +// MatchSet allows to match a string against multiple strings at once. Rough +// performance number: looking up ~80K different strings in 1M strings takes +// about <2s. +// +// The original use case was to match ~80K+ journal name abbreviations against +// 100M+ container name strings in references. +type MatchSet struct { + index *suffixarray.Index + b []byte + sep string +} + +var ( + bufPool = sync.Pool{ + New: func() interface{} { + var buf bytes.Buffer + return buf + }, + } +) + +// NewMatchSet creates a new MatchSet given a number of strings patterns. If +// any of the given strings contains the ASCII unit separator (1F), you may get +// unexpected results. +func NewMatchSet(ss []string) *MatchSet { + buf := bufPool.Get().(bytes.Buffer) + buf.Reset() + defer bufPool.Put(buf) + for _, s := range ss { + _, _ = io.WriteString(&buf, matchSetSep) + _, _ = io.WriteString(&buf, s) + } + return &MatchSet{ + b: buf.Bytes(), + index: suffixarray.New(buf.Bytes()), + sep: matchSetSep, + } +} + +// Lookup will match a given string as against all strings in the match set. If +// n < 0, all results are returned, otherwise at most n results are returned. +func (ms *MatchSet) Lookup(s string, n int) (result []string) { + buf := bufPool.Get().(bytes.Buffer) + buf.Reset() + defer bufPool.Put(buf) + _, _ = io.WriteString(&buf, matchSetSep) + _, _ = io.WriteString(&buf, s) + indices := ms.index.Lookup(buf.Bytes(), n) + for _, i := range indices { + var ( + offset = i + len(matchSetSep) + 1 + k = bytes.Index(ms.b[offset:], []byte(matchSetSep)) + ) + if k == -1 { + k = len(ms.b) + } else { + k = k + offset + } + result = append(result, string(ms.b[i+1:k])) + } + return +} |