about | summary | refs | log | tree | commit | diff | stats
path: root/skate/cmd
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2021-04-30 23:23:32 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-04-30 23:23:32 +0200
commit: 45eed4462d234f8502e38b0e98e205e341188072 (patch)
tree: 972fddadfb5f3cf1aa058268df52648abac7764f /skate/cmd
parent: be4c76e139551f56be9b7bcb96997904ed161075 (diff)
download: refcat-45eed4462d234f8502e38b0e98e205e341188072.tar.gz
refcat-45eed4462d234f8502e38b0e98e205e341188072.zip
implement a few flags as mapper middleware
Diffstat (limited to 'skate/cmd')
-rw-r--r-- skate/cmd/skate-map/main.go 40
1 files changed, 30 insertions, 10 deletions
diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index 2517878..67fc62b 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -2,6 +2,10 @@
// extract a key from a json document. For simple cases, you can use `jq` and
// other tools. Some key derivations require a bit more.
//
+// This tool helps us find similar things among billions of items by mapping
+// each doc to a key. All docs that share a key are considered match candidates and
+// can be post-processed, e.g. to verify matches or to generate output schemas.
+//
// An example with mostly unix tools. We want to extract the DOI and sort by
// it; we also want to do this fast, hence parallel, LC_ALL, etc.
//
@@ -29,7 +33,6 @@
//
// This is reasonably fast, but some cleanup is ugly. We also want more complex
// keys, e.g. more normalizations, etc. We'd like to encapsulate (2) to (8).
-
package main
import (
@@ -45,12 +48,15 @@ import (
)
var (
- mapperName = flag.String("m", "", "mapper to run")
- numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
- batchSize = flag.Int("b", 50000, "batch size")
- verbose = flag.Bool("verbose", false, "show progress")
- keyPrefix = flag.String("p", "", "a key prefix to use")
- extraValue = flag.String("x", "", "extra value to pass to configurable mappers")
+ mapperName = flag.String("m", "", "mapper to run")
+ numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
+ batchSize = flag.Int("b", 50000, "batch size")
+ verbose = flag.Bool("verbose", false, "show progress")
+ keyPrefix = flag.String("p", "", "a key prefix to use")
+ extraValue = flag.String("x", "", "extra value to pass to configurable mappers")
+ bestEffort = flag.Bool("B", false, "best effort")
+ logFile = flag.String("log", "", "log filename")
+ skipOnEmpty = flag.Int("skip-on-empty", -1, "omit docs with empty value in given field, zero indexed")
)
func main() {
@@ -67,15 +73,29 @@ func main() {
"ty": skate.MapperTitleNysiis,
"ts": skate.MapperTitleSandcrawler,
}
+ if *logFile != "" {
+ f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644)
+ if err != nil {
+ log.Fatal(err)
+ }
+ defer f.Close()
+ log.SetOutput(f)
+ }
switch {
case *mapperName != "":
- if f, ok := availableMappers[*mapperName]; !ok {
+ if mapf, ok := availableMappers[*mapperName]; !ok {
log.Fatalf("unknown mapper name: %v", *mapperName)
} else {
+ if *skipOnEmpty >= 0 {
+ mapf = skate.WithSkipOnEmpty(mapf, *skipOnEmpty)
+ }
if *keyPrefix != "" {
- f = skate.WithPrefix(f, *keyPrefix)
+ mapf = skate.WithPrefix(mapf, *keyPrefix)
+ }
+ if *bestEffort {
+ mapf = skate.WithBestEffort(mapf)
}
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, f.AsTSV)
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, mapf.AsTSV)
pp.NumWorkers = *numWorkers
pp.BatchSize = *batchSize
pp.Verbose = *verbose