allow to pass in field delimiter

author: Martin Czygan <martin.czygan@gmail.com> 2021-03-31 02:18:13 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-03-31 02:18:13 +0200
commit: 9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6 (patch)
tree: 7ac3e227dd376154c1af992b9d19936f958d347c /skate
parent: 5dde9bfd1f3c2e69d18b07ecc8893840479206f6 (diff)
download: refcat-9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6.tar.gz
refcat-9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6.zip
3 files changed, 16 insertions, 13 deletions
diff --git a/skate/cmd/skate-biblioref/main.go b/skate/cmd/skate-biblioref/main.go
index 85b2a46..d16c99b 100644
--- a/skate/cmd/skate-biblioref/main.go
+++ b/skate/cmd/skate-biblioref/main.go
@@ -33,10 +33,10 @@ import (
 	"strings"
 	"time"
 
-	"github.com/dgraph-io/ristretto"
-	jsoniter "github.com/json-iterator/go"
 	"git.archive.org/martin/cgraph/skate"
 	"git.archive.org/martin/cgraph/skate/parallel"
+	"github.com/dgraph-io/ristretto"
+	jsoniter "github.com/json-iterator/go"
 	"github.com/sethgrid/pester"
 )
 
@@ -83,7 +83,7 @@ func main() {
 			return nil, nil
 		}
 		br := skate.BiblioRef{
-			UpdateTs:           time.Now().Unix(),
+			IndexedTs:          time.Now().Unix(),
 			SourceReleaseIdent: source,
 			TargetReleaseIdent: target,
 			MatchStatus:        matchStatus,
diff --git a/skate/cmd/skate-bref-id/main.go b/skate/cmd/skate-bref-id/main.go
index ca3d7c4..21e1e9e 100644
--- a/skate/cmd/skate-bref-id/main.go
+++ b/skate/cmd/skate-bref-id/main.go
@@ -9,9 +9,9 @@ import (
 	"runtime"
 	"time"
 
-	jsoniter "github.com/json-iterator/go"
 	"git.archive.org/martin/cgraph/skate"
 	"git.archive.org/martin/cgraph/skate/parallel"
+	jsoniter "github.com/json-iterator/go"
 )
 
 var (
@@ -29,7 +29,7 @@ func main() {
 			return nil, err
 		}
 		bref.Key = fmt.Sprintf("%s_%d", bref.SourceReleaseIdent, bref.RefIndex)
-		bref.UpdateTs = time.Now().Unix()
+		bref.IndexedTs = time.Now().Unix()
 		b, err := json.Marshal(bref)
 		b = append(b, newlineB...)
 		return b, err
diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go
index 1c8dfda..754eab8 100644
--- a/skate/cmd/skate-cluster/main.go
+++ b/skate/cmd/skate-cluster/main.go
@@ -1,5 +1,6 @@
-// skate-cluster takes the output of skate-sorted-keys and generates a
-// "cluster" document, grouping docs by key. Can do some pre-filtering.
+// skate-cluster takes the (tab) output of skate-sorted-keys and generates a
+// "cluster" document, grouping docs by key. Can do some pre-filtering (e.g.
+// require refs and release docs in a single cluster).
 //
 // For example, this:
 //
@@ -25,12 +26,13 @@ import (
 
 var (
 	keyField       = flag.Int("k", 2, "which column contains the key (one based)")
-	docField       = flag.Int("d", 3, "which column contains the doc")
+	docField       = flag.Int("d", 3, "which column contains the doc (one based)")
 	minClusterSize = flag.Int("min", 2, "minimum cluster size")
 	maxClusterSize = flag.Int("max", 100000, "maximum cluster size")
 	requireBoth    = flag.Bool("both", false,
 		"require at least one ref and one non-ref item present in the cluster, implies -min 2")
 	dropEmptyKeys = flag.Bool("D", false, "drop empty keys")
+	delimiter     = flag.String("F", "\t", "field delimiter") // -F, since -d is already taken by docField
 )
 
 func main() {
@@ -52,7 +54,7 @@ func main() {
 		if err != nil {
 			log.Fatal(err)
 		}
-		fields = strings.Split(line, "\t")
+		fields = strings.Split(line, *delimiter)
 		if len(fields) <= keyIndex || len(fields) <= docIndex {
 			log.Fatalf("line has only %d fields", len(fields))
 		}
@@ -70,10 +72,8 @@ func main() {
 		prev = key
 		batch = append(batch, doc)
 	}
-	if len(batch) > 0 {
-		if err := writeBatch(bw, prev, batch); err != nil {
-			log.Fatal(err)
-		}
+	if err := writeBatch(bw, prev, batch); err != nil {
+		log.Fatal(err)
 	}
 }
 
@@ -93,6 +93,9 @@ func containsBoth(batch []string) bool {
 
 // writeBatch writes out a single line containing the key and the cluster values.
 func writeBatch(w io.Writer, key string, batch []string) (err error) {
+	if len(batch) == 0 {
+		return nil
+	}
 	if len(batch) < *minClusterSize || len(batch) > *maxClusterSize {
 		return nil
 	}
author	Martin Czygan <martin.czygan@gmail.com>	2021-03-31 02:18:13 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-03-31 02:18:13 +0200
commit	9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6 (patch)
tree	7ac3e227dd376154c1af992b9d19936f958d347c /skate
parent	5dde9bfd1f3c2e69d18b07ecc8893840479206f6 (diff)
download	refcat-9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6.tar.gz refcat-9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6.zip