allow passing in a field delimiter

author: Martin Czygan <martin.czygan@gmail.com> 2021-03-31 02:18:13 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-03-31 02:18:13 +0200
commit: 9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6 (patch)
tree: 7ac3e227dd376154c1af992b9d19936f958d347c /skate/cmd/skate-cluster
parent: 5dde9bfd1f3c2e69d18b07ecc8893840479206f6 (diff)
download: refcat-9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6.tar.gz
refcat-9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6.zip
1 files changed, 11 insertions, 8 deletions
diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go
index 1c8dfda..754eab8 100644
--- a/skate/cmd/skate-cluster/main.go
+++ b/skate/cmd/skate-cluster/main.go
@@ -1,5 +1,6 @@
-// skate-cluster takes the output of skate-sorted-keys and generates a
-// "cluster" document, grouping docs by key. Can do some pre-filtering.
+// skate-cluster takes the (tab) output of skate-sorted-keys and generates a
+// "cluster" document, grouping docs by key. Can do some pre-filtering (e.g.
+// require refs and release docs in a single cluster).
 //
 // For example, this:
 //
@@ -25,12 +26,13 @@ import (
 
 var (
 	keyField       = flag.Int("k", 2, "which column contains the key (one based)")
-	docField       = flag.Int("d", 3, "which column contains the doc")
+	docField       = flag.Int("d", 3, "which column contains the doc (one based)")
 	minClusterSize = flag.Int("min", 2, "minimum cluster size")
 	maxClusterSize = flag.Int("max", 100000, "maximum cluster size")
 	requireBoth    = flag.Bool("both", false,
 		"require at least one ref and one non-ref item present in the cluster, implies -min 2")
 	dropEmptyKeys = flag.Bool("D", false, "drop empty keys")
+	delimiter     = flag.String("F", "\t", "field delimiter") // -d is already taken by docField; a duplicate name panics in package flag
 )
 
 func main() {
@@ -52,7 +54,7 @@ func main() {
 		if err != nil {
 			log.Fatal(err)
 		}
-		fields = strings.Split(line, "\t")
+		fields = strings.Split(line, *delimiter)
 		if len(fields) <= keyIndex || len(fields) <= docIndex {
 			log.Fatalf("line has only %d fields", len(fields))
 		}
@@ -70,10 +72,8 @@ func main() {
 		prev = key
 		batch = append(batch, doc)
 	}
-	if len(batch) > 0 {
-		if err := writeBatch(bw, prev, batch); err != nil {
-			log.Fatal(err)
-		}
+	if err := writeBatch(bw, prev, batch); err != nil {
+		log.Fatal(err)
 	}
 }
 
@@ -93,6 +93,9 @@ func containsBoth(batch []string) bool {
 
 // writeBatch writes out a single line containing the key and the cluster values.
 func writeBatch(w io.Writer, key string, batch []string) (err error) {
+	if len(batch) == 0 {
+		return nil
+	}
 	if len(batch) < *minClusterSize || len(batch) > *maxClusterSize {
 		return nil
 	}
author	Martin Czygan <martin.czygan@gmail.com>	2021-03-31 02:18:13 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-03-31 02:18:13 +0200
commit	9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6 (patch)
tree	7ac3e227dd376154c1af992b9d19936f958d347c /skate/cmd/skate-cluster
parent	5dde9bfd1f3c2e69d18b07ecc8893840479206f6 (diff)
download	refcat-9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6.tar.gz refcat-9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6.zip