aboutsummaryrefslogtreecommitdiffstats
path: root/skate/matchset.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/matchset.go')
-rw-r--r--skate/matchset.go73
1 files changed, 73 insertions, 0 deletions
diff --git a/skate/matchset.go b/skate/matchset.go
new file mode 100644
index 0000000..c8882e2
--- /dev/null
+++ b/skate/matchset.go
@@ -0,0 +1,73 @@
+package skate
+
+import (
+ "bytes"
+ "index/suffixarray"
+ "io"
+ "sync"
+)
+
+const matchSetSep = "\u001F"
+
+// MatchSet allows to match a string against multiple strings at once. Rough
+// performance number: looking up ~80K different strings in 1M strings takes
+// about <2s.
+//
+// The original use case was to match ~80K+ journal name abbreviations against
+// 100M+ container name strings in references.
+type MatchSet struct {
+ index *suffixarray.Index
+ b []byte
+ sep string
+}
+
+var (
+ bufPool = sync.Pool{
+ New: func() interface{} {
+ var buf bytes.Buffer
+ return buf
+ },
+ }
+)
+
+// NewMatchSet creates a new MatchSet given a number of strings patterns. If
+// any of the given strings contains the ASCII unit separator (1F), you may get
+// unexpected results.
+func NewMatchSet(ss []string) *MatchSet {
+ buf := bufPool.Get().(bytes.Buffer)
+ buf.Reset()
+ defer bufPool.Put(buf)
+ for _, s := range ss {
+ _, _ = io.WriteString(&buf, matchSetSep)
+ _, _ = io.WriteString(&buf, s)
+ }
+ return &MatchSet{
+ b: buf.Bytes(),
+ index: suffixarray.New(buf.Bytes()),
+ sep: matchSetSep,
+ }
+}
+
+// Lookup will match a given string as against all strings in the match set. If
+// n < 0, all results are returned, otherwise at most n results are returned.
+func (ms *MatchSet) Lookup(s string, n int) (result []string) {
+ buf := bufPool.Get().(bytes.Buffer)
+ buf.Reset()
+ defer bufPool.Put(buf)
+ _, _ = io.WriteString(&buf, matchSetSep)
+ _, _ = io.WriteString(&buf, s)
+ indices := ms.index.Lookup(buf.Bytes(), n)
+ for _, i := range indices {
+ var (
+ offset = i + len(matchSetSep) + 1
+ k = bytes.Index(ms.b[offset:], []byte(matchSetSep))
+ )
+ if k == -1 {
+ k = len(ms.b)
+ } else {
+ k = k + offset
+ }
+ result = append(result, string(ms.b[i+1:k]))
+ }
+ return
+}