aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-06-01 12:53:37 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-06-01 12:53:37 +0200
commitbd862b51161fab8a62bcae49eb7452a82dd2b70a (patch)
treebca4e7c65ea68e162386d64c424287e926d4e125
parent99ab76153c4fef7f3efb3d58441a8ea784157750 (diff)
downloadrefcat-bd862b51161fab8a62bcae49eb7452a82dd2b70a.tar.gz
refcat-bd862b51161fab8a62bcae49eb7452a82dd2b70a.zip
add match set
-rw-r--r--skate/matchset.go73
-rw-r--r--skate/matchset_test.go81
2 files changed, 154 insertions, 0 deletions
diff --git a/skate/matchset.go b/skate/matchset.go
new file mode 100644
index 0000000..c8882e2
--- /dev/null
+++ b/skate/matchset.go
@@ -0,0 +1,73 @@
+package skate
+
+import (
+ "bytes"
+ "index/suffixarray"
+ "io"
+ "sync"
+)
+
+const matchSetSep = "\u001F"
+
+// MatchSet allows to match a string against multiple strings at once. Rough
+// performance number: looking up ~80K different strings in 1M strings takes
+// about <2s.
+//
+// The original use case was to match ~80K+ journal name abbreviations against
+// 100M+ container name strings in references.
+type MatchSet struct {
+ index *suffixarray.Index
+ b []byte
+ sep string
+}
+
+var (
+ bufPool = sync.Pool{
+ New: func() interface{} {
+ var buf bytes.Buffer
+ return buf
+ },
+ }
+)
+
+// NewMatchSet creates a new MatchSet given a number of strings patterns. If
+// any of the given strings contains the ASCII unit separator (1F), you may get
+// unexpected results.
+func NewMatchSet(ss []string) *MatchSet {
+ buf := bufPool.Get().(bytes.Buffer)
+ buf.Reset()
+ defer bufPool.Put(buf)
+ for _, s := range ss {
+ _, _ = io.WriteString(&buf, matchSetSep)
+ _, _ = io.WriteString(&buf, s)
+ }
+ return &MatchSet{
+ b: buf.Bytes(),
+ index: suffixarray.New(buf.Bytes()),
+ sep: matchSetSep,
+ }
+}
+
+// Lookup will match a given string as against all strings in the match set. If
+// n < 0, all results are returned, otherwise at most n results are returned.
+func (ms *MatchSet) Lookup(s string, n int) (result []string) {
+ buf := bufPool.Get().(bytes.Buffer)
+ buf.Reset()
+ defer bufPool.Put(buf)
+ _, _ = io.WriteString(&buf, matchSetSep)
+ _, _ = io.WriteString(&buf, s)
+ indices := ms.index.Lookup(buf.Bytes(), n)
+ for _, i := range indices {
+ var (
+ offset = i + len(matchSetSep) + 1
+ k = bytes.Index(ms.b[offset:], []byte(matchSetSep))
+ )
+ if k == -1 {
+ k = len(ms.b)
+ } else {
+ k = k + offset
+ }
+ result = append(result, string(ms.b[i+1:k]))
+ }
+ return
+}
diff --git a/skate/matchset_test.go b/skate/matchset_test.go
new file mode 100644
index 0000000..0436faf
--- /dev/null
+++ b/skate/matchset_test.go
@@ -0,0 +1,81 @@
+package skate
+
+import (
+ "reflect"
+ "sort"
+ "testing"
+)
+
+func TestMatchSetLookup(t *testing.T) {
+ var cases = []struct {
+ s string
+ ps []string
+ n int
+ result []string
+ }{
+ {
+ s: "",
+ ps: []string{},
+ n: -1,
+ result: nil,
+ },
+ {
+ s: "hello",
+ ps: []string{"lo"},
+ n: -1,
+ result: nil,
+ },
+ {
+ s: "hello",
+ ps: []string{"zz"},
+ n: -1,
+ result: nil,
+ },
+ {
+ s: "hello",
+ ps: []string{"h", "he", "hel"},
+ n: -1,
+ result: nil,
+ },
+ {
+ s: "hello",
+ ps: []string{"hello"},
+ n: -1,
+ result: []string{"hello"},
+ },
+ {
+ s: "hello world",
+ ps: []string{"hello", "world"},
+ n: -1,
+ result: nil,
+ },
+ {
+ s: "he",
+ ps: []string{"hello", "world", "hel"},
+ n: -1,
+ result: []string{"hello", "hel"},
+ },
+ {
+ s: "american italian proceedings",
+ ps: []string{
+ "italian",
+ "american",
+ "american italian",
+ "american italian proceedings",
+ "proceedings",
+ },
+ n: -1,
+ result: []string{"american italian proceedings"},
+ },
+ }
+ for _, c := range cases {
+ ms := NewMatchSet(c.ps)
+ result := ms.Lookup(c.s, c.n)
+ sort.Strings(result)
+ sort.Strings(c.result)
+ if !reflect.DeepEqual(result, c.result) {
+ t.Errorf("got %v, want %v -> s=%s ps=%s",
+ result, c.result, c.s, c.ps)
+ }
+ }
+}