diff options
-rw-r--r-- | skate/cmd/skate-biblioref/main.go | 6 | ||||
-rw-r--r-- | skate/cmd/skate-bref-id/main.go | 4 | ||||
-rw-r--r-- | skate/cmd/skate-cluster/main.go | 19 |
3 files changed, 16 insertions, 13 deletions
diff --git a/skate/cmd/skate-biblioref/main.go b/skate/cmd/skate-biblioref/main.go index 85b2a46..d16c99b 100644 --- a/skate/cmd/skate-biblioref/main.go +++ b/skate/cmd/skate-biblioref/main.go @@ -33,10 +33,10 @@ import ( "strings" "time" - "github.com/dgraph-io/ristretto" - jsoniter "github.com/json-iterator/go" "git.archive.org/martin/cgraph/skate" "git.archive.org/martin/cgraph/skate/parallel" + "github.com/dgraph-io/ristretto" + jsoniter "github.com/json-iterator/go" "github.com/sethgrid/pester" ) @@ -83,7 +83,7 @@ func main() { return nil, nil } br := skate.BiblioRef{ - UpdateTs: time.Now().Unix(), + IndexedTs: time.Now().Unix(), SourceReleaseIdent: source, TargetReleaseIdent: target, MatchStatus: matchStatus, diff --git a/skate/cmd/skate-bref-id/main.go b/skate/cmd/skate-bref-id/main.go index ca3d7c4..21e1e9e 100644 --- a/skate/cmd/skate-bref-id/main.go +++ b/skate/cmd/skate-bref-id/main.go @@ -9,9 +9,9 @@ import ( "runtime" "time" - jsoniter "github.com/json-iterator/go" "git.archive.org/martin/cgraph/skate" "git.archive.org/martin/cgraph/skate/parallel" + jsoniter "github.com/json-iterator/go" ) var ( @@ -29,7 +29,7 @@ func main() { return nil, err } bref.Key = fmt.Sprintf("%s_%d", bref.SourceReleaseIdent, bref.RefIndex) - bref.UpdateTs = time.Now().Unix() + bref.IndexedTs = time.Now().Unix() b, err := json.Marshal(bref) b = append(b, newlineB...) return b, err diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go index 1c8dfda..754eab8 100644 --- a/skate/cmd/skate-cluster/main.go +++ b/skate/cmd/skate-cluster/main.go @@ -1,5 +1,6 @@ -// skate-cluster takes the output of skate-sorted-keys and generates a -// "cluster" document, grouping docs by key. Can do some pre-filtering. +// skate-cluster takes the (tab) output of skate-sorted-keys and generates a +// "cluster" document, grouping docs by key. Can do some pre-filtering (e.g. +// require refs and release docs in a single cluster). 
// // For example, this: // @@ -25,12 +26,13 @@ import ( var ( keyField = flag.Int("k", 2, "which column contains the key (one based)") - docField = flag.Int("d", 3, "which column contains the doc") + docField = flag.Int("d", 3, "which column contains the doc (one based)") minClusterSize = flag.Int("min", 2, "minimum cluster size") maxClusterSize = flag.Int("max", 100000, "maximum cluster size") requireBoth = flag.Bool("both", false, "require at least one ref and one non-ref item present in the cluster, implies -min 2") dropEmptyKeys = flag.Bool("D", false, "drop empty keys") + delimiter = flag.String("delim", "\t", "field delimiter") ) func main() { @@ -52,7 +54,7 @@ func main() { if err != nil { log.Fatal(err) } - fields = strings.Split(line, "\t") + fields = strings.Split(line, *delimiter) if len(fields) <= keyIndex || len(fields) <= docIndex { log.Fatalf("line has only %d fields", len(fields)) } @@ -70,10 +72,8 @@ func main() { prev = key batch = append(batch, doc) } - if len(batch) > 0 { - if err := writeBatch(bw, prev, batch); err != nil { - log.Fatal(err) - } + if err := writeBatch(bw, prev, batch); err != nil { + log.Fatal(err) } } @@ -93,6 +93,9 @@ func containsBoth(batch []string) bool { // writeBatch writes out a single line containing the key and the cluster values. func writeBatch(w io.Writer, key string, batch []string) (err error) { + if len(batch) == 0 { + return nil + } if len(batch) < *minClusterSize || len(batch) > *maxClusterSize { return nil } |