aboutsummaryrefslogtreecommitdiffstats
path: root/skate/matchset.go
blob: 3ad1047a32c2609fa3dc7f1b511de2ad6ef67168 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
package skate

import (
	"bytes"
	"index/suffixarray"
	"io"
	"sync"
)

// matchSetSep separates the strings we want to lookup. This is a hack to get
// this going: We build a suffix array out of "1F<s>1F<s>..." and prepend "1F"
// to the string to lookup to check for a match. This results in a behaviour
// similar to strings.HasPrefix. Multiple results must be handled by the user
// (or just work with the cases, where you get exactly one match). A related
// blog post on a similar technique:
// https://eli.thegreenplace.net/2016/suffix-arrays-in-the-go-standard-library/.
const matchSetSep = "\u001F"

// MatchSet allows to match a string against multiple strings at once. Rough
// performance: looking up ~80K different strings in 1M strings takes <2s.
//
// The original use case was to match ~80K+ journal name abbreviations against
// 100M+ container name strings in references.
type MatchSet struct {
	index *suffixarray.Index
	b     []byte
	sep   string
}

var (
	bufPool = sync.Pool{
		New: func() interface{} {
			var buf bytes.Buffer
			return buf
		},
	}
)

// NewMatchSet creates a new MatchSet given a number of strings patterns. If
// any of the given strings contains the ASCII unit separator (1F), you may get
// unexpected results.
func NewMatchSet(ss []string) *MatchSet {
	buf := bufPool.Get().(bytes.Buffer)
	buf.Reset()
	defer bufPool.Put(buf)
	for _, s := range ss {
		_, _ = io.WriteString(&buf, matchSetSep)
		_, _ = io.WriteString(&buf, s)
	}
	return &MatchSet{
		b:     buf.Bytes(),
		index: suffixarray.New(buf.Bytes()),
		sep:   matchSetSep,
	}
}

// Lookup will match a given string as against all strings in the match set. If
// n < 0, all results are returned, otherwise at most n results are returned.
func (ms *MatchSet) Lookup(s string, n int) (result []string) {
	buf := bufPool.Get().(bytes.Buffer)
	buf.Reset()
	defer bufPool.Put(buf)
	_, _ = io.WriteString(&buf, matchSetSep)
	_, _ = io.WriteString(&buf, s)
	indices := ms.index.Lookup(buf.Bytes(), n)
	for _, i := range indices {
		var (
			offset = i + len(matchSetSep) + 1
			k      = bytes.Index(ms.b[offset:], []byte(matchSetSep))
		)
		if k == -1 {
			k = len(ms.b)
		} else {
			k = k + offset
		}
		result = append(result, string(ms.b[i+1:k]))
	}
	return
}