diff options
Diffstat (limited to 'skate/cmd/skate-map')
-rw-r--r-- | skate/cmd/skate-map/main.go | 26 |
1 file changed, 13 insertions, 13 deletions
diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index f87c02f..a437705 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -2,9 +2,9 @@
 // extract a key from a json document. For simple cases, you can use `jq` and
 // other tools. Some key derivations require a bit more, hence a dedicated program.
 //
-// An example with mostly unix tools. We want to extract (DOI, doc) tuples from
-// newline delimited JSON and sort by it; we also want to do this fast, hence
-// parallel, LC_ALL, etc.
+// An example with mostly unix tools. We want to extract (DOI, doc) tuples
+// (sorted by DOI) from newline delimited JSON; we also want to do this fast,
+// hence GNU parallel, LC_ALL, etc.
 //
 // $ zstdcat -T0 file.zst | (1)
 // LC_ALL=C tr -d '\t' | (2) *
@@ -20,7 +20,7 @@
 // (1) zstd is fast! "~4x faster than zlib" (https://is.gd/HT1DUs)
 // (2) we use tab as column separator and we want clean this up before (could
 // be skipped, if we limit number of splits)
-// (3) we pass the data to jq, with a bit larger buffer (default is 1MB)
+// (3) we pass the data to jq, with a bit larger buffer for GNU parallel (default is 1MB, currently)
 // (4) we want no "null" output
 // (5) tostring prints the input as string, because we need to carry the document forward ...
 // (6) ... but we'll need some cleanup, too
@@ -28,9 +28,9 @@
 // (8) a custom filter to normalize a DOI in a specific column
 // (9) sorting by DOI
 //
-// This is reasonably fast, but some cleanup is ugly. We also want more complex
-// keys, e.g. more normalizations, etc; in short: we'd like to encapsulate (2)
-// to (8) with `skate-map`.
+// This is reasonably fast, but some data cleanup code is ugly. We also want
+// more complex keys, e.g. more normalizations, etc; in short: we'd like to
+// encapsulate (2) to (8) with `skate-map`.
 package main
 
 import (
@@ -92,19 +92,19 @@ func main() {
 	}
 	switch {
 	case *mapperName != "":
-		if mapf, ok := availableMappers[*mapperName]; !ok {
+		if mapper, ok := availableMappers[*mapperName]; !ok {
 			log.Fatalf("unknown mapper name: %v", *mapperName)
 		} else {
 			if *skipOnEmpty > 0 {
-				mapf = skate.WithSkipOnEmpty(mapf, *skipOnEmpty-1)
+				mapper = skate.WithSkipOnEmpty(mapper, *skipOnEmpty-1)
 			}
 			if *keyPrefix != "" {
-				mapf = skate.WithPrefix(mapf, *keyPrefix)
+				mapper = skate.WithPrefix(mapper, *keyPrefix)
 			}
 			if *bestEffort {
-				mapf = skate.WithBestEffort(mapf)
+				mapper = skate.WithBestEffort(mapper)
 			}
-			pp := parallel.NewProcessor(os.Stdin, os.Stdout, mapf.AsTSV)
+			pp := parallel.NewProcessor(os.Stdin, os.Stdout, mapper.AsTSV)
 			pp.NumWorkers = *numWorkers
 			pp.BatchSize = *batchSize
 			pp.Verbose = *verbose
@@ -115,9 +115,9 @@ func main() {
 	default:
 		fmt.Println(help)
 		w := tabwriter.NewWriter(os.Stdout, 0, 0, 4, ' ', 0)
+		defer w.Flush()
 		for k, v := range availableMappers {
 			fmt.Fprintf(w, "%s\t%s\n", k, skate.NameOf(v))
 		}
-		w.Flush()
 	}
 }