aboutsummaryrefslogtreecommitdiffstats
path: root/skate/zipkey/batch.go
blob: e8ba052e16cef11c97fc490de7a980f4e8d2ea10 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
package zipkey

import (
	"runtime"
	"sync"
)

// Tuning constants for Batcher.
//
// defaultBatchSize is the batch size NewBatcher passes to NewBatcherSize.
// maxGroupSize is the group size (length of G0 or G1) above which GroupFunc
// hands the current batch to the workers right away instead of waiting for
// the batch to fill up.
const (
	defaultBatchSize = 2000
	maxGroupSize     = 500 // short circuiting threshold
)

// Batcher runs reducers (over groups) in parallel on batches. This can have a
// significant impact on runtime, e.g. for certain datasets and operations we
// saw a reduction of 80% in processing time.
//
// Groups passed to GroupFunc are collected into batches; full batches are
// sent over an unbuffered channel to a fixed set of worker goroutines, which
// apply the group function to every group. GroupFunc and Close are not thread
// safe and must be called from a single goroutine; the group function itself
// is called concurrently from the workers.
type Batcher struct {
	// Size is the number of groups collected before a batch is handed to
	// the workers. A value below one flushes after every single group.
	Size int
	// NumWorkers is the number of worker goroutines started by the
	// constructor; changing it afterwards has no effect.
	NumWorkers int

	gf      groupFunc      // applied to every group by the workers
	batch   []*Group       // groups collected since the last flush
	queue   chan []*Group  // unbuffered hand-off to the workers
	wg      sync.WaitGroup // tracks running workers
	mu      sync.Mutex     // guards err
	err     error          // first error returned by gf, if any
	closing bool           // set by Close; https://stackoverflow.com/q/16105325/89391
}

// NewBatcher sets up a new Batcher with the default batch size.
func NewBatcher(gf groupFunc) *Batcher {
	return NewBatcherSize(gf, defaultBatchSize)
}

// NewBatcherSize initializes a batcher with a given function to apply and a
// batch size, and starts one worker goroutine per CPU. A size below one falls
// back to the default batch size. The returned Batcher must be closed with
// Close to stop the workers.
func NewBatcherSize(gf groupFunc, size int) *Batcher {
	if size < 1 {
		size = defaultBatchSize
	}
	b := &Batcher{
		gf:         gf,
		Size:       size,
		NumWorkers: runtime.NumCPU(),
		queue:      make(chan []*Group),
	}
	b.wg.Add(b.NumWorkers)
	for i := 0; i < b.NumWorkers; i++ {
		go b.worker()
	}
	return b
}

// Close tears down the batcher: it hands the last, partially filled batch to
// the workers, closes the queue and waits for all workers to finish. It
// returns the first error any worker ran into, or nil. If this is not called,
// you get goroutine leaks and will miss the data from the last uncommitted
// batch. Calling this function more than once will result in a panic.
func (b *Batcher) Close() error {
	b.closing = true
	b.flush()
	close(b.queue)
	b.wg.Wait()
	return b.firstErr()
}

// GroupFunc is a drop-in for a groupFunc. Use this function, where you used
// groupFunc before. Not thread safe. Panics if called after Close.
//
// The group is only queued here; the actual work happens in the workers. The
// returned error is the first error recorded by a worker so far, so it may
// stem from an earlier group, and a nil result does not mean that g has been
// processed already.
func (b *Batcher) GroupFunc(g *Group) error {
	if b.closing {
		panic("batcher: cannot call GroupFunc after Close")
	}
	b.batch = append(b.batch, g)
	// A few groups have an extended size, e.g. thousands of members. We short
	// circuit on those to save memory.
	oversized := len(g.G0) > maxGroupSize || len(g.G1) > maxGroupSize
	// Use >= rather than == so that a Size lowered after construction still
	// triggers a flush.
	if len(b.batch) >= b.Size || oversized {
		b.flush()
	}
	return b.firstErr()
}

// flush hands the collected groups to the workers and starts a fresh batch.
// The slice is given away as is: it is never touched again on this side, so
// no copy is required. An empty batch is not sent.
func (b *Batcher) flush() {
	if len(b.batch) == 0 {
		return
	}
	b.queue <- b.batch
	b.batch = nil
}

// setErr records err unless an earlier error has been recorded already.
func (b *Batcher) setErr(err error) {
	b.mu.Lock()
	defer b.mu.Unlock()
	if b.err == nil {
		b.err = err
	}
}

// firstErr returns the first error recorded by any worker, or nil.
func (b *Batcher) firstErr() error {
	b.mu.Lock()
	defer b.mu.Unlock()
	return b.err
}

// worker applies the group function to every group of every batch it
// receives. After the first failure it records the error and stops calling
// the group function, but keeps receiving (and discarding) batches until the
// queue is closed. Draining instead of returning keeps the sends in GroupFunc
// and Close from blocking forever once every worker has failed.
func (b *Batcher) worker() {
	defer b.wg.Done()
	failed := false
	for batch := range b.queue {
		if failed {
			continue
		}
		for _, g := range batch {
			if err := b.gf(g); err != nil {
				b.setErr(err)
				failed = true
				break
			}
		}
	}
}