aboutsummaryrefslogtreecommitdiffstats
path: root/skate/map.go
blob: ae8b59f84f54bfdfeb195da7f66c003015008236 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
package skate

import (
	"bytes"
	"fmt"
	"reflect"
	"runtime"
	"strconv"
	"strings"

	json "github.com/segmentio/encoding/json"
)

// Mapper is the function signature shared by the mappers in this file: it
// takes the raw bytes of one input record and returns derived bytes or an
// error. The title mappers below return the extracted key, a tab, and the
// original input; the mapper built by CreateFixedFieldFunc may also return
// a nil slice with a nil error when it finds no usable key.
type Mapper func([]byte) ([]byte, error)

// NameOf reports a printable name for f. When f is a function whose entry
// point the runtime can resolve, the result is the runtime's name for that
// function. In every other case (f is not a function, or the runtime has no
// information for it) the result is reflect's string form of the value.
func NameOf(f interface{}) string {
	val := reflect.ValueOf(f)
	if val.Kind() != reflect.Func {
		return val.String()
	}
	// FuncForPC yields nil when the program counter does not map to a known
	// function, e.g. for a nil func value.
	fn := runtime.FuncForPC(val.Pointer())
	if fn == nil {
		return val.String()
	}
	return fn.Name()
}

// Identity is a mapper that returns its input unchanged. The returned slice
// is p itself, not a copy, and the error is always nil.
func Identity(p []byte) ([]byte, error) {
	return p, nil
}

// CreateFixedFieldFunc creates an extractor function given a json path.
// Currently only top level key is supported.
//
// The returned Mapper decodes its input as a JSON object and looks up path
// as a top-level key. For a string or numeric value it returns the value, a
// tab, and the unmodified input. It returns (nil, nil) when the key is
// absent or holds any other kind of value (null, bool, array, object), and
// (nil, err) when the input cannot be decoded.
//
// Numbers are rendered in plain decimal notation, never with an exponent,
// so an identifier such as 12345678 yields the key "12345678" rather than
// "1.2345678e+07". Numbers still pass through float64, so integers beyond
// 2^53 may lose precision.
func CreateFixedFieldFunc(path string) Mapper {
	return func(p []byte) ([]byte, error) {
		var doc map[string]interface{}
		if err := json.Unmarshal(p, &doc); err != nil {
			return nil, err
		}
		v, ok := doc[path]
		if !ok {
			return nil, nil
		}
		var key string
		switch t := v.(type) {
		case string:
			key = t
		case float64:
			// Decoding into interface{} produces float64 for every JSON
			// number; 'f' with precision -1 gives the shortest exact
			// decimal form without switching to scientific notation.
			key = strconv.FormatFloat(t, 'f', -1, 64)
		default:
			return nil, nil
		}
		return []byte(fmt.Sprintf("%s\t%s", key, p)), nil
	}
}

// MapperTitle decodes p as JSON, takes the document's Title field, trims
// surrounding whitespace and passes the result through wsReplacer (defined
// elsewhere in the package; presumably it rewrites embedded whitespace so
// the key stays within one tab-separated column - confirm against its
// definition). The output is that key, a tab, and the original p. A decode
// failure is returned as (nil, err).
func MapperTitle(p []byte) ([]byte, error) {
	var record struct {
		Title string
	}
	err := json.Unmarshal(p, &record)
	if err != nil {
		return nil, err
	}
	trimmed := strings.TrimSpace(record.Title)
	key := wsReplacer.Replace(trimmed)
	columns := [][]byte{[]byte(key), p}
	return bytes.Join(columns, []byte("\t")), nil
}

// MapperTitleNormalized decodes p as JSON and derives a normalized key from
// the document's Title field. The steps, in order: trim surrounding
// whitespace, apply wsReplacer, lowercase, replace every repeatedWs match
// with a single space, and delete every nonWord match. wsReplacer,
// repeatedWs and nonWord are defined elsewhere in the package. The output
// is the key, a tab, and the original p. A decode failure is returned as
// (nil, err).
func MapperTitleNormalized(p []byte) ([]byte, error) {
	var record struct {
		Title string
	}
	err := json.Unmarshal(p, &record)
	if err != nil {
		return nil, err
	}
	cleaned := wsReplacer.Replace(strings.TrimSpace(record.Title))
	lowered := strings.ToLower(cleaned)
	collapsed := repeatedWs.ReplaceAllString(lowered, " ")
	key := nonWord.ReplaceAllString(collapsed, "")
	columns := [][]byte{[]byte(key), p}
	return bytes.Join(columns, []byte("\t")), nil
}

// MapperTitleNysiis decodes p as JSON, trims the document's Title field,
// applies wsReplacer and then feeds the result to NYSIIS (defined elsewhere
// in the package; by its name presumably the NYSIIS phonetic encoding -
// confirm against its definition). The output is the encoded key, a tab,
// and the original p. A decode failure is returned as (nil, err).
func MapperTitleNysiis(p []byte) ([]byte, error) {
	var record struct {
		Title string
	}
	err := json.Unmarshal(p, &record)
	if err != nil {
		return nil, err
	}
	cleaned := wsReplacer.Replace(strings.TrimSpace(record.Title))
	key := NYSIIS(cleaned)
	columns := [][]byte{[]byte(key), p}
	return bytes.Join(columns, []byte("\t")), nil
}

func MapperTitleSandcrawler(p []byte) ([]byte, error) {
	var doc struct {
		Title string
	}
	if err := json.Unmarshal(p, &doc); err != nil {
		return nil, err
	}
	title := sandcrawlerSlugify(wsReplacer.Replace(strings.TrimSpace(doc.Title)))
	return bytes.Join([][]byte{[]byte(title), p}, []byte("\t")), nil
}