skate/cmd/skate-cluster/main.go


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107

// skate-cluster takes the output of skate-sorted-keys and generates a
// "cluster" document, grouping docs by key. Can do some pre-filtering.
//
// For example, this:
//
//     id123    somekey123    {"a":"b", ...}
//     id391    somekey123    {"x":"y", ...}
//
// would turn into (a single line containing all docs with the same key).
//
//     {"k": "somekey123", "v": [{"a":"b", ...},{"x":"y",...}]}
//
// A single line cluster is easier to parallelize (e.g. for verification, etc.).
package main

import (
	"bufio"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"strings"
)

var (
	keyField       = flag.Int("k", 2, "which column contains the key (one based)")
	docField       = flag.Int("d", 3, "which column contains the doc")
	minClusterSize = flag.Int("min", 2, "minimum cluster size")
	maxClusterSize = flag.Int("max", 100000, "maximum cluster size")
	requireBoth    = flag.Bool("both", false,
		"require at least one ref and one non-ref item present in the cluster, implies -min 2")
	dropEmptyKeys = flag.Bool("D", false, "drop empty keys")
)

func main() {
	flag.Parse()
	var (
		br             = bufio.NewReader(os.Stdin)
		bw             = bufio.NewWriter(os.Stdout)
		prev, key, doc string
		batch, fields  []string
		keyIndex       = *keyField - 1
		docIndex       = *docField - 1
	)
	defer bw.Flush()
	for {
		line, err := br.ReadString('\n')
		if err == io.EOF {
			break
		}
		if err != nil {
			log.Fatal(err)
		}
		fields = strings.Split(line, "\t")
		if len(fields) <= keyIndex || len(fields) <= docIndex {
			log.Fatalf("line has only %d fields", len(fields))
		}
		key = strings.TrimSpace(fields[keyIndex])
		if *dropEmptyKeys && len(key) == 0 {
			continue
		}
		doc = strings.TrimSpace(fields[docIndex])
		if prev != key {
			if err := writeBatch(bw, key, batch); err != nil {
				log.Fatal(err)
			}
			batch = nil
		}
		prev = key
		batch = append(batch, doc)
	}
	if len(batch) > 0 {
		if err := writeBatch(bw, prev, batch); err != nil {
			log.Fatal(err)
		}
	}
}

// containsBoth return true, if we have a ref and a non-ref item in the batch.
func containsBoth(batch []string) bool {
	var isRef int
	for _, doc := range batch {
		// This is brittle. Most JSON should be in compact form, and there the
		// following chars are by convention added to distinguish a release
		// coming from a reference doc from other releases.
		if strings.Contains(doc, `"status":"ref"`) {
			isRef++
		}
	}
	return isRef > 0 && isRef < len(batch)
}

// writeBatch writes out a single line containing the key and the cluster values.
func writeBatch(w io.Writer, key string, batch []string) (err error) {
	if len(batch) < *minClusterSize || len(batch) > *maxClusterSize {
		return nil
	}
	if *requireBoth && !containsBoth(batch) {
		return nil
	}
	// This is brittle, but all items in a batch are valid JSON objects, hence,
	// the following will be valid JSON as well, or will it? The key should not
	// contain a quote.
	_, err = fmt.Fprintf(w, "{\"k\": \"%s\", \"v\": [%s]}\n", key, strings.Join(batch, ","))
	return
}