aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cmd/skate-cluster/main.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/cmd/skate-cluster/main.go')
-rw-r--r--skate/cmd/skate-cluster/main.go107
1 files changed, 107 insertions, 0 deletions
diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go
new file mode 100644
index 0000000..1c8dfda
--- /dev/null
+++ b/skate/cmd/skate-cluster/main.go
@@ -0,0 +1,107 @@
+// skate-cluster takes the output of skate-sorted-keys and generates a
+// "cluster" document, grouping docs by key. Can do some pre-filtering.
+//
+// For example, this:
+//
+// id123 somekey123 {"a":"b", ...}
+// id391 somekey123 {"x":"y", ...}
+//
+// would turn into (a single line containing all docs with the same key).
+//
+// {"k": "somekey123", "v": [{"a":"b", ...},{"x":"y",...}]}
+//
+// A single line cluster is easier to parallelize (e.g. for verification, etc.).
+package main
+
+import (
+ "bufio"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "strings"
+)
+
+var (
+ keyField = flag.Int("k", 2, "which column contains the key (one based)")
+ docField = flag.Int("d", 3, "which column contains the doc")
+ minClusterSize = flag.Int("min", 2, "minimum cluster size")
+ maxClusterSize = flag.Int("max", 100000, "maximum cluster size")
+ requireBoth = flag.Bool("both", false,
+ "require at least one ref and one non-ref item present in the cluster, implies -min 2")
+ dropEmptyKeys = flag.Bool("D", false, "drop empty keys")
+)
+
+func main() {
+ flag.Parse()
+ var (
+ br = bufio.NewReader(os.Stdin)
+ bw = bufio.NewWriter(os.Stdout)
+ prev, key, doc string
+ batch, fields []string
+ keyIndex = *keyField - 1
+ docIndex = *docField - 1
+ )
+ defer bw.Flush()
+ for {
+ line, err := br.ReadString('\n')
+ if err == io.EOF {
+ break
+ }
+ if err != nil {
+ log.Fatal(err)
+ }
+ fields = strings.Split(line, "\t")
+ if len(fields) <= keyIndex || len(fields) <= docIndex {
+ log.Fatalf("line has only %d fields", len(fields))
+ }
+ key = strings.TrimSpace(fields[keyIndex])
+ if *dropEmptyKeys && len(key) == 0 {
+ continue
+ }
+ doc = strings.TrimSpace(fields[docIndex])
+ if prev != key {
+ if err := writeBatch(bw, key, batch); err != nil {
+ log.Fatal(err)
+ }
+ batch = nil
+ }
+ prev = key
+ batch = append(batch, doc)
+ }
+ if len(batch) > 0 {
+ if err := writeBatch(bw, prev, batch); err != nil {
+ log.Fatal(err)
+ }
+ }
+}
+
+// containsBoth return true, if we have a ref and a non-ref item in the batch.
+func containsBoth(batch []string) bool {
+ var isRef int
+ for _, doc := range batch {
+ // This is brittle. Most JSON should be in compact form, and there the
+ // following chars are by convention added to distinguish a release
+ // coming from a reference doc from other releases.
+ if strings.Contains(doc, `"status":"ref"`) {
+ isRef++
+ }
+ }
+ return isRef > 0 && isRef < len(batch)
+}
+
+// writeBatch writes out a single line containing the key and the cluster values.
+func writeBatch(w io.Writer, key string, batch []string) (err error) {
+ if len(batch) < *minClusterSize || len(batch) > *maxClusterSize {
+ return nil
+ }
+ if *requireBoth && !containsBoth(batch) {
+ return nil
+ }
+ // This is brittle, but all items in a batch are valid JSON objects, hence,
+ // the following will be valid JSON as well, or will it? The key should not
+ // contain a quote.
+ _, err = fmt.Fprintf(w, "{\"k\": \"%s\", \"v\": [%s]}\n", key, strings.Join(batch, ","))
+ return
+}