aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cmd/skate-map/main.go
blob: c5fb798c83dfa3db6ccfc868a621c2ba89ac2062 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
// skate-map runs a given map function over input data. Most mappers extract
// a key from a JSON document.
package main

import (
	"flag"
	"fmt"
	"log"
	"os"
	"runtime"
	"text/tabwriter"

	"git.archive.org/martin/cgraph/skate"
	"git.archive.org/martin/cgraph/skate/parallel"
)

// Command-line flags. Each variable is a pointer that holds the flag's default
// until flag.Parse has run.

// mapperName is the key of the mapper to run (-m).
var mapperName = flag.String("m", "", "mapper to run")

// numWorkers is the number of workers (-w); defaults to runtime.NumCPU().
var numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")

// batchSize is the batch size (-b).
var batchSize = flag.Int("b", 50000, "batch size")

// verbose toggles progress output (-verbose).
var verbose = flag.Bool("verbose", false, "show progress")

// extraValue is an additional value handed to configurable mappers (-x).
var extraValue = flag.String("x", "", "extra value to pass to configurable mappers")

// main parses the command-line flags and then either runs the mapper selected
// with -m over os.Stdin, writing to os.Stdout, or, when no mapper name was
// given, prints a table of the available mappers.
//
// The process exits via log.Fatal/log.Fatalf when the mapper name is unknown,
// when the processor's Run returns an error, or when the listing cannot be
// flushed to stdout.
func main() {
	flag.Parse()
	// XXX: introduce prefixes
	availableMappers := map[string]skate.Mapper{
		"id": skate.Identity,
		// The fixed-field mapper is configured from the -x flag, so it has to
		// be built after flag.Parse.
		"ff":    skate.CreateFixedFieldFunc(*extraValue),
		"title": skate.MapperTitle,
		"tnorm": skate.MapperTitleNormalized,
		"tnysi": skate.MapperTitleNysiis,
		"tsand": skate.MapperTitleSandcrawler,
	}

	// Without a mapper name there is nothing to run; show what is available.
	if *mapperName == "" {
		fmt.Println("skate-map available mappers")
		fmt.Println()
		w := tabwriter.NewWriter(os.Stdout, 0, 0, 4, ' ', 0)
		// NOTE: map iteration order is unspecified, so the row order may
		// differ between runs.
		for k, v := range availableMappers {
			fmt.Fprintf(w, "%s\t%s\n", k, skate.NameOf(v))
		}
		// tabwriter buffers its output; Flush is where write errors surface.
		if err := w.Flush(); err != nil {
			log.Fatal(err)
		}
		return
	}

	f, ok := availableMappers[*mapperName]
	if !ok {
		// Fatalf, not Fatal: the message carries a format verb.
		log.Fatalf("unknown mapper name: %v", *mapperName)
	}

	pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
	pp.NumWorkers = *numWorkers
	pp.BatchSize = *batchSize
	pp.Verbose = *verbose
	if err := pp.Run(); err != nil {
		log.Fatal(err)
	}
}