about summary refs log tree commit diff stats
path: root/skate
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-03-31 02:18:13 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-03-31 02:18:13 +0200
commit 9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6 (patch)
tree 7ac3e227dd376154c1af992b9d19936f958d347c /skate
parent 5dde9bfd1f3c2e69d18b07ecc8893840479206f6 (diff)
download refcat-9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6.tar.gz
refcat-9e5024fc3eed88ffc6d1ded1f7e2044fc621efe6.zip
allow passing in a field delimiter
Diffstat (limited to 'skate')
-rw-r--r-- skate/cmd/skate-biblioref/main.go 6
-rw-r--r-- skate/cmd/skate-bref-id/main.go 4
-rw-r--r-- skate/cmd/skate-cluster/main.go 19
3 files changed, 16 insertions, 13 deletions
diff --git a/skate/cmd/skate-biblioref/main.go b/skate/cmd/skate-biblioref/main.go
index 85b2a46..d16c99b 100644
--- a/skate/cmd/skate-biblioref/main.go
+++ b/skate/cmd/skate-biblioref/main.go
@@ -33,10 +33,10 @@ import (
"strings"
"time"
- "github.com/dgraph-io/ristretto"
- jsoniter "github.com/json-iterator/go"
"git.archive.org/martin/cgraph/skate"
"git.archive.org/martin/cgraph/skate/parallel"
+ "github.com/dgraph-io/ristretto"
+ jsoniter "github.com/json-iterator/go"
"github.com/sethgrid/pester"
)
@@ -83,7 +83,7 @@ func main() {
return nil, nil
}
br := skate.BiblioRef{
- UpdateTs: time.Now().Unix(),
+ IndexedTs: time.Now().Unix(),
SourceReleaseIdent: source,
TargetReleaseIdent: target,
MatchStatus: matchStatus,
diff --git a/skate/cmd/skate-bref-id/main.go b/skate/cmd/skate-bref-id/main.go
index ca3d7c4..21e1e9e 100644
--- a/skate/cmd/skate-bref-id/main.go
+++ b/skate/cmd/skate-bref-id/main.go
@@ -9,9 +9,9 @@ import (
"runtime"
"time"
- jsoniter "github.com/json-iterator/go"
"git.archive.org/martin/cgraph/skate"
"git.archive.org/martin/cgraph/skate/parallel"
+ jsoniter "github.com/json-iterator/go"
)
var (
@@ -29,7 +29,7 @@ func main() {
return nil, err
}
bref.Key = fmt.Sprintf("%s_%d", bref.SourceReleaseIdent, bref.RefIndex)
- bref.UpdateTs = time.Now().Unix()
+ bref.IndexedTs = time.Now().Unix()
b, err := json.Marshal(bref)
b = append(b, newlineB...)
return b, err
diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go
index 1c8dfda..754eab8 100644
--- a/skate/cmd/skate-cluster/main.go
+++ b/skate/cmd/skate-cluster/main.go
@@ -1,5 +1,6 @@
-// skate-cluster takes the output of skate-sorted-keys and generates a
-// "cluster" document, grouping docs by key. Can do some pre-filtering.
+// skate-cluster takes the (tab) output of skate-sorted-keys and generates a
+// "cluster" document, grouping docs by key. Can do some pre-filtering (e.g.
+// require refs and release docs in a single cluster).
//
// For example, this:
//
@@ -25,12 +26,13 @@ import (
var (
keyField = flag.Int("k", 2, "which column contains the key (one based)")
- docField = flag.Int("d", 3, "which column contains the doc")
+ docField = flag.Int("d", 3, "which column contains the doc (one based)")
minClusterSize = flag.Int("min", 2, "minimum cluster size")
maxClusterSize = flag.Int("max", 100000, "maximum cluster size")
requireBoth = flag.Bool("both", false,
"require at least one ref and one non-ref item present in the cluster, implies -min 2")
dropEmptyKeys = flag.Bool("D", false, "drop empty keys")
+ delimiter = flag.String("d", "\t", "field delimiter")
)
func main() {
@@ -52,7 +54,7 @@ func main() {
if err != nil {
log.Fatal(err)
}
- fields = strings.Split(line, "\t")
+ fields = strings.Split(line, *delimiter)
if len(fields) <= keyIndex || len(fields) <= docIndex {
log.Fatalf("line has only %d fields", len(fields))
}
@@ -70,10 +72,8 @@ func main() {
prev = key
batch = append(batch, doc)
}
- if len(batch) > 0 {
- if err := writeBatch(bw, prev, batch); err != nil {
- log.Fatal(err)
- }
+ if err := writeBatch(bw, prev, batch); err != nil {
+ log.Fatal(err)
}
}
@@ -93,6 +93,9 @@ func containsBoth(batch []string) bool {
// writeBatch writes out a single line containing the key and the cluster values.
func writeBatch(w io.Writer, key string, batch []string) (err error) {
+ if len(batch) == 0 {
+ return nil
+ }
if len(batch) < *minClusterSize || len(batch) > *maxClusterSize {
return nil
}