| author | Martin Czygan <martin.czygan@gmail.com> | 2021-03-21 01:17:38 +0100 |
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2021-03-21 01:17:38 +0100 |
| commit | 09a7e8c9d013f13a1aa1ef4e9b7f397647b79967 (patch) | |
| tree | 122b474e27afbc66cba1182e983ef5c8555ed12f /skate/cmd/skate-cluster | |
| parent | a7e0cf191ebf8fb499e0ab9a3b6cae45727f1286 (diff) | |
| download | refcat-09a7e8c9d013f13a1aa1ef4e9b7f397647b79967.tar.gz, refcat-09a7e8c9d013f13a1aa1ef4e9b7f397647b79967.zip | |
initial import of skate
Diffstat (limited to 'skate/cmd/skate-cluster')
-rw-r--r-- | skate/cmd/skate-cluster/main.go | 107 |
1 file changed, 107 insertions, 0 deletions
diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go
new file mode 100644
index 0000000..1c8dfda
--- /dev/null
+++ b/skate/cmd/skate-cluster/main.go
@@ -0,0 +1,107 @@
+// skate-cluster takes the output of skate-sorted-keys and generates a
+// "cluster" document, grouping docs by key. Can do some pre-filtering.
+//
+// For example, this:
+//
+//     id123    somekey123    {"a":"b", ...}
+//     id391    somekey123    {"x":"y", ...}
+//
+// would turn into (a single line containing all docs with the same key):
+//
+//     {"k": "somekey123", "v": [{"a":"b", ...},{"x":"y",...}]}
+//
+// A single-line cluster is easier to parallelize (e.g. for verification, etc.).
+package main
+
+import (
+	"bufio"
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"strings"
+)
+
+var (
+	keyField       = flag.Int("k", 2, "which column contains the key (one based)")
+	docField       = flag.Int("d", 3, "which column contains the doc")
+	minClusterSize = flag.Int("min", 2, "minimum cluster size")
+	maxClusterSize = flag.Int("max", 100000, "maximum cluster size")
+	requireBoth    = flag.Bool("both", false,
+		"require at least one ref and one non-ref item present in the cluster, implies -min 2")
+	dropEmptyKeys = flag.Bool("D", false, "drop empty keys")
+)
+
+func main() {
+	flag.Parse()
+	var (
+		br             = bufio.NewReader(os.Stdin)
+		bw             = bufio.NewWriter(os.Stdout)
+		prev, key, doc string
+		batch, fields  []string
+		keyIndex       = *keyField - 1
+		docIndex       = *docField - 1
+	)
+	defer bw.Flush()
+	for {
+		line, err := br.ReadString('\n')
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			log.Fatal(err)
+		}
+		fields = strings.Split(line, "\t")
+		if len(fields) <= keyIndex || len(fields) <= docIndex {
+			log.Fatalf("line has only %d fields", len(fields))
+		}
+		key = strings.TrimSpace(fields[keyIndex])
+		if *dropEmptyKeys && len(key) == 0 {
+			continue
+		}
+		doc = strings.TrimSpace(fields[docIndex])
+		if prev != key { // key changed: flush the previous key's batch
+			if err := writeBatch(bw, prev, batch); err != nil {
+				log.Fatal(err)
+			}
+			batch = nil
+		}
+		prev = key
+		batch = append(batch, doc)
+	}
+	if len(batch) > 0 {
+		if err := writeBatch(bw, prev, batch); err != nil {
+			log.Fatal(err)
+		}
+	}
+}
+
+// containsBoth returns true if we have both a ref and a non-ref item in the batch.
+func containsBoth(batch []string) bool {
+	var isRef int
+	for _, doc := range batch {
+		// This is brittle. Most JSON should be in compact form, and there the
+		// following chars are by convention added to distinguish a release
+		// coming from a reference doc from other releases.
+		if strings.Contains(doc, `"status":"ref"`) {
+			isRef++
+		}
+	}
+	return isRef > 0 && isRef < len(batch)
+}
+
+// writeBatch writes out a single line containing the key and the cluster values.
+func writeBatch(w io.Writer, key string, batch []string) (err error) {
+	if len(batch) < *minClusterSize || len(batch) > *maxClusterSize {
+		return nil
+	}
+	if *requireBoth && !containsBoth(batch) {
+		return nil
+	}
+	// This is brittle, but all items in a batch are valid JSON objects, hence,
+	// the following will be valid JSON as well, or will it? The key should not
+	// contain a quote.
+	_, err = fmt.Fprintf(w, "{\"k\": \"%s\", \"v\": [%s]}\n", key, strings.Join(batch, ","))
+	return
+}
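The single-line output is what makes downstream parallelization cheap: every line is a self-contained unit of work. Below is a minimal sketch of a consumer, assuming only the `{"k": ..., "v": [...]}` shape emitted by `writeBatch` above; the `Cluster` type, its name, and the sample input are illustrative and not part of this commit.

```go
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"log"
	"strings"
)

// Cluster mirrors one output line of skate-cluster: a key plus the raw
// JSON docs grouped under it. The type name is hypothetical.
type Cluster struct {
	Key    string            `json:"k"`
	Values []json.RawMessage `json:"v"`
}

func main() {
	// Stand-in for piping skate-cluster output; normally this would be os.Stdin.
	input := `{"k": "somekey123", "v": [{"a":"b"},{"x":"y"}]}`
	scanner := bufio.NewScanner(strings.NewReader(input))
	// With -max at 100000 docs per cluster, a single line can exceed the
	// scanner's default 64 KiB token limit, so raise the buffer.
	scanner.Buffer(make([]byte, 0, 1024*1024), 16*1024*1024)
	for scanner.Scan() {
		var c Cluster
		if err := json.Unmarshal(scanner.Bytes(), &c); err != nil {
			log.Fatal(err)
		}
		// Each cluster is independent, so lines can be fanned out to
		// workers (e.g. for verification).
		fmt.Printf("key=%s size=%d\n", c.Key, len(c.Values))
	}
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
}
```

Keeping the grouped docs as `json.RawMessage` avoids re-parsing documents that are only passed through to the next stage.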