From bd862b51161fab8a62bcae49eb7452a82dd2b70a Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 1 Jun 2021 12:53:37 +0200 Subject: add match set --- skate/matchset.go | 73 +++++++++++++++++++++++++++++++++++++++++++++ skate/matchset_test.go | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 skate/matchset.go create mode 100644 skate/matchset_test.go (limited to 'skate') diff --git a/skate/matchset.go b/skate/matchset.go new file mode 100644 index 0000000..c8882e2 --- /dev/null +++ b/skate/matchset.go @@ -0,0 +1,73 @@ +package skate + +import ( + "bytes" + "index/suffixarray" + "io" + "sync" +) + +const matchSetSep = "\u001F" + +// MatchSet allows to match a string against multiple strings at once. Rough +// performance number: looking up ~80K different strings in 1M strings takes +// about <2s. +// +// The original use case was to match ~80K+ journal name abbreviations against +// 100M+ container name strings in references. +type MatchSet struct { + index *suffixarray.Index + b []byte + sep string +} + +var ( + bufPool = sync.Pool{ + New: func() interface{} { + var buf bytes.Buffer + return buf + }, + } +) + +// NewMatchSet creates a new MatchSet given a number of strings patterns. If +// any of the given strings contains the ASCII unit separator (1F), you may get +// unexpected results. +func NewMatchSet(ss []string) *MatchSet { + buf := bufPool.Get().(bytes.Buffer) + buf.Reset() + defer bufPool.Put(buf) + for _, s := range ss { + _, _ = io.WriteString(&buf, matchSetSep) + _, _ = io.WriteString(&buf, s) + } + return &MatchSet{ + b: buf.Bytes(), + index: suffixarray.New(buf.Bytes()), + sep: matchSetSep, + } +} + +// Lookup will match a given string as against all strings in the match set. If +// n < 0, all results are returned, otherwise at most n results are returned. +func (ms *MatchSet) Lookup(s string, n int) (result []string) { + buf := bufPool.Get().(bytes.Buffer) + buf.Reset() + defer bufPool.Put(buf) + _, _ = io.WriteString(&buf, matchSetSep) + _, _ = io.WriteString(&buf, s) + indices := ms.index.Lookup(buf.Bytes(), n) + for _, i := range indices { + var ( + offset = i + len(matchSetSep) + 1 + k = bytes.Index(ms.b[offset:], []byte(matchSetSep)) + ) + if k == -1 { + k = len(ms.b) + } else { + k = k + offset + } + result = append(result, string(ms.b[i+1:k])) + } + return +} diff --git a/skate/matchset_test.go b/skate/matchset_test.go new file mode 100644 index 0000000..0436faf --- /dev/null +++ b/skate/matchset_test.go @@ -0,0 +1,81 @@ +package skate + +import ( + "reflect" + "sort" + "testing" +) + +func TestMatchSetLookup(t *testing.T) { + var cases = []struct { + s string + ps []string + n int + result []string + }{ + { + s: "", + ps: []string{}, + n: -1, + result: nil, + }, + { + s: "hello", + ps: []string{"lo"}, + n: -1, + result: nil, + }, + { + s: "hello", + ps: []string{"zz"}, + n: -1, + result: nil, + }, + { + s: "hello", + ps: []string{"h", "he", "hel"}, + n: -1, + result: nil, + }, + { + s: "hello", + ps: []string{"hello"}, + n: -1, + result: []string{"hello"}, + }, + { + s: "hello world", + ps: []string{"hello", "world"}, + n: -1, + result: nil, + }, + { + s: "he", + ps: []string{"hello", "world", "hel"}, + n: -1, + result: []string{"hello", "hel"}, + }, + { + s: "american italian proceedings", + ps: []string{ + "italian", + "american", + "american italian", + "american italian proceedings", + "proceedings", + }, + n: -1, + result: []string{"american italian proceedings"}, + }, + } + for _, c := range cases { + ms := NewMatchSet(c.ps) + result := ms.Lookup(c.s, c.n) + sort.Strings(result) + sort.Strings(c.result) + if !reflect.DeepEqual(result, c.result) { + t.Errorf("got %v, want %v -> s=%s ps=%s", + result, c.result, c.s, c.ps) + } + } +} -- cgit v1.2.3