1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
|
package zipkey
import (
"runtime"
"sync"
)
const (
	// defaultBatchSize is the number of groups collected before a batch is
	// handed off to the worker goroutines (see GroupFunc).
	defaultBatchSize = 2000
	// maxGroupSize is the short circuiting threshold: if either side of a
	// group exceeds this many members, GroupFunc flushes the current batch
	// immediately to save memory.
	maxGroupSize = 500 // short circuiting threshold
)
// Batcher runs reducers (over groups) in parallel on batches. This can have a
// significant impact on runtime, e.g. for certain datasets and operations we
// saw a reduction of 80% in processing time.
//
// Usage: construct with NewBatcher/NewBatcherSize, feed groups through
// GroupFunc from a single producer goroutine, then call Close exactly once.
type Batcher struct {
	Size       int       // number of groups per batch that triggers a flush
	NumWorkers int       // worker goroutine count, set to runtime.NumCPU() at construction
	gf         groupFunc // reducer applied to each group by the workers
	batch      []*Group  // current, not yet submitted batch; touched only by the producer
	queue      chan []*Group  // completed batches handed off to the workers
	wg         sync.WaitGroup // tracks worker goroutines so Close can wait for them
	// err holds the first error any worker encountered.
	// NOTE(review): written by workers and read by the producer (GroupFunc,
	// Close) without synchronization — confirm callers tolerate this race or
	// guard it with a mutex/atomic.
	err     error
	closing bool // set by Close; GroupFunc panics afterwards. https://stackoverflow.com/q/16105325/89391
}
// NewBatcher sets up a new Batcher using the package default batch size.
func NewBatcher(gf groupFunc) *Batcher {
	return NewBatcherSize(gf, defaultBatchSize)
}
// NewBatcherSize initializes a batcher with a given function to apply and a
// batch size. It starts one worker goroutine per CPU; call Close to shut
// them down and collect any error.
func NewBatcherSize(gf groupFunc, size int) *Batcher {
	b := &Batcher{
		gf:         gf,
		Size:       size,
		NumWorkers: runtime.NumCPU(),
		queue:      make(chan []*Group),
	}
	b.wg.Add(b.NumWorkers)
	for w := 0; w < b.NumWorkers; w++ {
		go b.worker()
	}
	return b
}
// Close tears down the batcher: it flushes the last uncommitted batch, closes
// the work queue and waits for all workers to wind down. If this is not
// called, you get goroutine leaks and will miss the data from the last
// uncommitted batch. Calling this function more than once will result in a
// panic (send on or close of a closed channel).
//
// It returns the first error any worker encountered, or nil.
func (b *Batcher) Close() error {
	b.closing = true
	// Only hand off a final batch if there is anything left; previously an
	// empty slice was sent unconditionally, waking a worker for no work.
	if len(b.batch) > 0 {
		g := make([]*Group, len(b.batch))
		copy(g, b.batch)
		b.queue <- g
		b.batch = nil
	}
	close(b.queue)
	b.wg.Wait()
	return b.err
}
// GroupFunc is a drop-in for a groupFunc. Use this function, where you used
// groupFunc before. Not thread safe — a single producer goroutine is assumed.
// Panics if called after Close.
//
// It returns the first error any worker encountered so far, allowing the
// producer to stop early.
func (b *Batcher) GroupFunc(g *Group) error {
	if b.closing {
		panic("batcher: cannot call GroupFunc after Close")
	}
	b.batch = append(b.batch, g)
	// A few groups have an extended size, e.g. thousands of members. We short
	// circuit on those to save memory.
	oversized := len(g.G0) > maxGroupSize || len(g.G1) > maxGroupSize
	// Use >= rather than ==: Size is an exported field, and if a caller ever
	// lowers it while batching is in progress, a strict equality check could
	// be stepped over, letting the batch grow without bound.
	if len(b.batch) >= b.Size || oversized {
		batch := make([]*Group, len(b.batch))
		copy(batch, b.batch)
		b.queue <- batch
		b.batch = nil
	}
	return b.err
}
// worker applies the reducer to every group of every batch it receives.
// Multiple workers may set b.err; we currently only care whether the error is
// nil or not.
//
// After an error, the worker keeps receiving (and discarding) batches instead
// of returning: if every worker stopped receiving, the producer's send in
// GroupFunc or Close would block forever on the unbuffered queue, deadlocking
// shutdown. Draining keeps the channel flowing until Close closes it.
func (b *Batcher) worker() {
	defer b.wg.Done()
	var failed bool // worker-local, so no extra unsynchronized read of b.err
	for batch := range b.queue {
		if failed {
			continue // drain without processing
		}
		for _, g := range batch {
			if err := b.gf(g); err != nil {
				b.err = err
				failed = true
				break
			}
		}
	}
}
|