diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-03-31 02:18:13 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-03-31 02:18:13 +0200 |
commit | 9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6 (patch) | |
tree | 7ac3e227dd376154c1af992b9d19936f958d347c /skate/cmd/skate-cluster | |
parent | 5dde9bfd1f3c2e69d18b07ecc8893840479206f6 (diff) | |
download | refcat-9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6.tar.gz refcat-9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6.zip |
allow passing in a field delimiter
Diffstat (limited to 'skate/cmd/skate-cluster')
-rw-r--r-- | skate/cmd/skate-cluster/main.go | 19 |
1 file changed, 11 insertions, 8 deletions
diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go index 1c8dfda..754eab8 100644 --- a/skate/cmd/skate-cluster/main.go +++ b/skate/cmd/skate-cluster/main.go @@ -1,5 +1,6 @@ -// skate-cluster takes the output of skate-sorted-keys and generates a -// "cluster" document, grouping docs by key. Can do some pre-filtering. +// skate-cluster takes the (tab) output of skate-sorted-keys and generates a +// "cluster" document, grouping docs by key. Can do some pre-filtering (e.g. +// require refs and release docs in a single cluster). // // For example, this: // @@ -25,12 +26,13 @@ import ( var ( keyField = flag.Int("k", 2, "which column contains the key (one based)") - docField = flag.Int("d", 3, "which column contains the doc") + docField = flag.Int("d", 3, "which column contains the doc (one based)") minClusterSize = flag.Int("min", 2, "minimum cluster size") maxClusterSize = flag.Int("max", 100000, "maximum cluster size") requireBoth = flag.Bool("both", false, "require at least one ref and one non-ref item present in the cluster, implies -min 2") dropEmptyKeys = flag.Bool("D", false, "drop empty keys") + delimiter = flag.String("d", "\t", "field delimiter") ) func main() { @@ -52,7 +54,7 @@ func main() { if err != nil { log.Fatal(err) } - fields = strings.Split(line, "\t") + fields = strings.Split(line, *delimiter) if len(fields) <= keyIndex || len(fields) <= docIndex { log.Fatalf("line has only %d fields", len(fields)) } @@ -70,10 +72,8 @@ func main() { prev = key batch = append(batch, doc) } - if len(batch) > 0 { - if err := writeBatch(bw, prev, batch); err != nil { - log.Fatal(err) - } + if err := writeBatch(bw, prev, batch); err != nil { + log.Fatal(err) } } @@ -93,6 +93,9 @@ func containsBoth(batch []string) bool { // writeBatch writes out a single line containing the key and the cluster values. 
func writeBatch(w io.Writer, key string, batch []string) (err error) { + if len(batch) == 0 { + return nil + } if len(batch) < *minClusterSize || len(batch) > *maxClusterSize { return nil } |