1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
// skate-map runs a given map function over input data. We mostly want to
// extract a key from a json document.
package main
import (
	"flag"
	"fmt"
	"log"
	"os"
	"runtime"
	"sort"
	"text/tabwriter"

	"git.archive.org/martin/cgraph/skate"
	"git.archive.org/martin/cgraph/skate/parallel"
)
// Command line flags. Each variable is a pointer that holds its final value
// only after flag.Parse has been called.
var (
	// Mapper selection: -m names the mapper, -x carries an extra value that
	// configurable mappers receive.
	mapperName = flag.String("m", "", "mapper to run")
	extraValue = flag.String("x", "", "extra value to pass to configurable mappers")

	// Processing options: worker count (defaults to the number of CPUs
	// reported by the runtime), batch size and progress output.
	numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
	batchSize  = flag.Int("b", 50000, "batch size")
	verbose    = flag.Bool("verbose", false, "show progress")
)
// main parses the command line flags and then does one of two things:
//
//   - If no mapper name was given via -m, it prints a table of the available
//     mapper names next to the value of skate.NameOf for each mapper, sorted
//     by mapper name, and returns.
//   - Otherwise it looks up the named mapper and runs it over os.Stdin,
//     writing to os.Stdout through a parallel processor configured from the
//     -w, -b and -verbose flags. An unknown mapper name or a processing error
//     terminates the program via log.Fatal.
func main() {
	flag.Parse()
	// XXX: introduce prefixes
	availableMappers := map[string]skate.Mapper{
		"id":    skate.Identity,
		"field": skate.WithPrefix(skate.CreateFixedMapper(*extraValue), "field"),
		"title": skate.WithPrefix(skate.MapperTitle, "title"),
		"tnorm": skate.WithPrefix(skate.MapperTitleNormalized, "tnorm"),
		"tnysi": skate.WithPrefix(skate.MapperTitleNysiis, "tnysi"),
		"tsand": skate.WithPrefix(skate.MapperTitleSandcrawler, "tsand"),
	}
	if *mapperName == "" {
		fmt.Println("skate-map available mappers")
		fmt.Println()
		// Map iteration order is unspecified in Go; sort the names so the
		// listing is stable across runs.
		names := make([]string, 0, len(availableMappers))
		for name := range availableMappers {
			names = append(names, name)
		}
		sort.Strings(names)
		w := tabwriter.NewWriter(os.Stdout, 0, 0, 4, ' ', 0)
		for _, name := range names {
			fmt.Fprintf(w, "%s\t%s\n", name, skate.NameOf(availableMappers[name]))
		}
		// The tabwriter buffers its output; a failed flush means the table
		// was not (fully) written.
		if err := w.Flush(); err != nil {
			log.Fatal(err)
		}
		return
	}
	f, ok := availableMappers[*mapperName]
	if !ok {
		log.Fatalf("unknown mapper name: %v", *mapperName)
	}
	pp := parallel.NewProcessor(os.Stdin, os.Stdout, f.AsTSV)
	pp.NumWorkers = *numWorkers
	pp.BatchSize = *batchSize
	pp.Verbose = *verbose
	if err := pp.Run(); err != nil {
		log.Fatal(err)
	}
}
|