aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cmd
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-07-17 16:02:26 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-07-17 16:02:26 +0200
commit03a2119d0b6f0a7a76194c1aaa074b218bc90a2f (patch)
tree2d215467d958a46b1460be40723e3659e17d0c17 /skate/cmd
parent279238e1e55abd37c5f8605e9494d2810ff1b34c (diff)
downloadrefcat-03a2119d0b6f0a7a76194c1aaa074b218bc90a2f.tar.gz
refcat-03a2119d0b6f0a7a76194c1aaa074b218bc90a2f.zip
cleanup another script
Diffstat (limited to 'skate/cmd')
-rw-r--r--skate/cmd/skate-cluster/main.go112
1 file changed, 0 insertions, 112 deletions
diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go
deleted file mode 100644
index de11de1..0000000
--- a/skate/cmd/skate-cluster/main.go
+++ /dev/null
@@ -1,112 +0,0 @@
-// skate-cluster takes the (tab) output of skate-map (plus sort) and generates
-// a "cluster" document, grouping docs by key. Can do some pre-filtering (e.g.
-// require refs and release docs in a single cluster).
-//
-// For example, this:
-//
-// id123 somekey123 {"a":"b", ...}
-// id391 somekey123 {"x":"y", ...}
-//
-// would turn into (a single line containing all docs with the same key).
-//
-// {"k": "somekey123", "v": [{"a":"b", ...},{"x":"y",...}]}
-//
-// A single line cluster is easier to parallelize (e.g. for verification, etc.).
-package main
-
-import (
- "bufio"
- "flag"
- "fmt"
- "io"
- "log"
- "os"
- "strings"
-)
-
-var (
- keyField = flag.Int("k", 2, "which column contains the key (one based)")
- docField = flag.Int("d", 3, "which column contains the doc (one based)")
- minClusterSize = flag.Int("min", 2, "minimum cluster size")
- maxClusterSize = flag.Int("max", 100000, "maximum cluster size")
- requireBoth = flag.Bool("both", false,
- "require at least one ref and one non-ref item present in the cluster, implies -min 2")
- dropEmptyKeys = flag.Bool("D", false, "drop empty keys")
- delimiter = flag.String("d", "\t", "field delimiter")
-)
-
-func main() {
- flag.Parse()
- var (
- br = bufio.NewReader(os.Stdin)
- bw = bufio.NewWriter(os.Stdout)
- prev, key, doc string
- batch, fields []string
- keyIndex = *keyField - 1
- docIndex = *docField - 1
- line string
- err error
- )
- defer bw.Flush()
- for {
- line, err = br.ReadString('\n')
- if err == io.EOF {
- break
- }
- if err != nil {
- log.Fatal(err)
- }
- fields = strings.Split(line, *delimiter)
- if len(fields) <= keyIndex || len(fields) <= docIndex {
- log.Fatalf("line has only %d fields", len(fields))
- }
- key = strings.TrimSpace(fields[keyIndex])
- if *dropEmptyKeys && len(key) == 0 {
- continue
- }
- doc = strings.TrimSpace(fields[docIndex])
- if prev != key {
- if err := writeBatch(bw, key, batch); err != nil {
- log.Fatal(err)
- }
- batch = nil
- }
- prev = key
- batch = append(batch, doc)
- }
- if err := writeBatch(bw, prev, batch); err != nil {
- log.Fatal(err)
- }
-}
-
-// containsBoth return true, if we have a ref and a non-ref item in the batch.
-func containsBoth(batch []string) bool {
- var numRef int
- for _, doc := range batch {
- // This is brittle (but faster). Most JSON should be in compact form,
- // and there the following chars are by convention added to distinguish
- // a release coming from a reference doc from other releases.
- if strings.Contains(doc, `"status":"ref"`) {
- numRef++
- }
- }
- return numRef > 0 && numRef < len(batch)
-}
-
-// writeBatch writes out a single line containing the key and the cluster values.
-func writeBatch(w io.Writer, key string, batch []string) (err error) {
- if len(batch) == 0 {
- return nil
- }
- if len(batch) < *minClusterSize || len(batch) > *maxClusterSize {
- return nil
- }
- if *requireBoth && !containsBoth(batch) {
- return nil
- }
- // This is brittle (and fast), but all items in a batch are valid JSON
- // objects, hence, the following will be valid JSON as well, or will it?
- // The key should not contain a quote.
- _, err = fmt.Fprintf(w, "{\"k\": \"%s\", \"v\": [%s]}\n", key, strings.Join(batch, ","))
- return
-}