aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cmd/skate-map
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-07-27 13:17:58 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-07-27 13:17:58 +0200
commit 9a07523ddb1b1afae67cf52e5ca264b755a8e494 (patch)
tree b46bb52293ee4362296ad9f3ff8b1d0857c29eeb /skate/cmd/skate-map
parent e5b01062cec62216fb7c4f0806f2d997f70097f8 (diff)
download refcat-9a07523ddb1b1afae67cf52e5ca264b755a8e494.tar.gz
refcat-9a07523ddb1b1afae67cf52e5ca264b755a8e494.zip
minor tweaks and doc improvements
Diffstat (limited to 'skate/cmd/skate-map')
-rw-r--r-- skate/cmd/skate-map/main.go 26
1 files changed, 13 insertions, 13 deletions
diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index f87c02f..a437705 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -2,9 +2,9 @@
// extract a key from a json document. For simple cases, you can use `jq` and
// other tools. Some key derivations require a bit more, hence a dedicated program.
//
-// An example with mostly unix tools. We want to extract (DOI, doc) tuples from
-// newline delimited JSON and sort by it; we also want to do this fast, hence
-// parallel, LC_ALL, etc.
+// An example with mostly unix tools. We want to extract (DOI, doc) tuples
+// (sorted by DOI) from newline delimited JSON; we also want to do this fast,
+// hence GNU parallel, LC_ALL, etc.
//
// $ zstdcat -T0 file.zst | (1)
// LC_ALL=C tr -d '\t' | (2) *
@@ -20,7 +20,7 @@
// (1) zstd is fast! "~4x faster than zlib" (https://is.gd/HT1DUs)
// (2) we use tab as column separator, so we want to clean this up beforehand
// (could be skipped, if we limit the number of splits)
-// (3) we pass the data to jq, with a bit larger buffer (default is 1MB)
+// (3) we pass the data to jq, with a slightly larger buffer for GNU parallel (the default is currently 1MB)
// (4) we want no "null" output
// (5) tostring prints the input as string, because we need to carry the document forward ...
// (6) ... but we'll need some cleanup, too
@@ -28,9 +28,9 @@
// (8) a custom filter to normalize a DOI in a specific column
// (9) sorting by DOI
//
-// This is reasonably fast, but some cleanup is ugly. We also want more complex
-// keys, e.g. more normalizations, etc; in short: we'd like to encapsulate (2)
-// to (8) with `skate-map`.
+// This is reasonably fast, but some data cleanup code is ugly. We also want
+// more complex keys, e.g. more normalizations, etc; in short: we'd like to
+// encapsulate (2) to (8) with `skate-map`.
package main
import (
@@ -92,19 +92,19 @@ func main() {
}
switch {
case *mapperName != "":
- if mapf, ok := availableMappers[*mapperName]; !ok {
+ if mapper, ok := availableMappers[*mapperName]; !ok {
log.Fatalf("unknown mapper name: %v", *mapperName)
} else {
if *skipOnEmpty > 0 {
- mapf = skate.WithSkipOnEmpty(mapf, *skipOnEmpty-1)
+ mapper = skate.WithSkipOnEmpty(mapper, *skipOnEmpty-1)
}
if *keyPrefix != "" {
- mapf = skate.WithPrefix(mapf, *keyPrefix)
+ mapper = skate.WithPrefix(mapper, *keyPrefix)
}
if *bestEffort {
- mapf = skate.WithBestEffort(mapf)
+ mapper = skate.WithBestEffort(mapper)
}
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, mapf.AsTSV)
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, mapper.AsTSV)
pp.NumWorkers = *numWorkers
pp.BatchSize = *batchSize
pp.Verbose = *verbose
@@ -115,9 +115,9 @@ func main() {
default:
fmt.Println(help)
w := tabwriter.NewWriter(os.Stdout, 0, 0, 4, ' ', 0)
+ defer w.Flush()
for k, v := range availableMappers {
fmt.Fprintf(w, "%s\t%s\n", k, skate.NameOf(v))
}
- w.Flush()
}
}