1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
|
package skate
import (
	"bytes"
	"fmt"
	"reflect"
	"runtime"
	"strconv"
	"strings"

	json "github.com/segmentio/encoding/json"
)
// Mapper turns one blob of bytes into another and reports any error that
// occurred while doing so. Several mappers in this file (e.g. MapperTitle)
// return an extracted key, a tab and the unmodified input; a mapper may also
// return a nil slice together with a nil error (see CreateFixedFieldFunc).
type Mapper func([]byte) ([]byte, error)
// NameOf returns a printable name for f. If f is a function and the runtime
// can resolve its entry point, the result is the function name as reported by
// runtime.Func.Name. For every other value (including nil) the result is
// whatever reflect.Value.String yields for it.
func NameOf(f interface{}) string {
	val := reflect.ValueOf(f)
	if val.Kind() != reflect.Func {
		return val.String()
	}
	fn := runtime.FuncForPC(val.Pointer())
	if fn == nil {
		// The runtime has no record for this program counter; fall back to
		// the generic reflect representation.
		return val.String()
	}
	return fn.Name()
}
// Identity is the no-op mapper: it hands back p itself (the same slice, not a
// copy) and never returns an error.
func Identity(p []byte) ([]byte, error) {
	return p, nil
}
// CreateFixedFieldFunc creates an extractor function given a json path.
// Currently only top level key is supported.
//
// The returned Mapper decodes its input as a JSON object and looks up path as
// a top level key. If the value is a string or a number, the result is the
// value, a tab and the unmodified input. If the key is missing or the value
// has any other type (bool, null, array, object), the mapper returns
// (nil, nil). Invalid JSON yields the decoder's error.
//
// Numbers are rendered in plain decimal notation (e.g. 12345678, not
// 1.2345678e+07) so that numeric identifiers survive as usable keys. Values
// with a magnitude of 1e21 or more keep Go's default float formatting.
//
// NOTE: string values are emitted verbatim; a value containing a tab or a
// newline will end up inside the key column unescaped.
func CreateFixedFieldFunc(path string) Mapper {
	return func(p []byte) ([]byte, error) {
		var doc map[string]interface{}
		if err := json.Unmarshal(p, &doc); err != nil {
			return nil, err
		}
		v, ok := doc[path]
		if !ok {
			return nil, nil
		}
		var key string
		switch t := v.(type) {
		case string:
			key = t
		case float64:
			// JSON numbers decode to float64; the default %v formatting
			// switches to exponent notation from 1e6 on, which mangles
			// integer-like keys.
			if t > -1e21 && t < 1e21 {
				key = strconv.FormatFloat(t, 'f', -1, 64)
			} else {
				key = fmt.Sprint(t)
			}
		case int, int64, float32:
			// Not produced by a plain Unmarshal into interface{}, kept for
			// backward compatibility with the previous type switch.
			key = fmt.Sprint(t)
		default:
			return nil, nil
		}
		out := make([]byte, 0, len(key)+1+len(p))
		out = append(out, key...)
		out = append(out, '\t')
		return append(out, p...), nil
	}
}
// MapperTitle decodes p as JSON, takes the Title field, trims surrounding
// whitespace and passes the result through wsReplacer (declared elsewhere in
// the package; presumably replaces embedded whitespace such as tabs -- TODO
// confirm). It returns that key, a tab and the unmodified input. A decoding
// failure is returned as is. A document without a title yields an empty key.
func MapperTitle(p []byte) ([]byte, error) {
	var rec struct {
		Title string
	}
	err := json.Unmarshal(p, &rec)
	if err != nil {
		return nil, err
	}
	key := strings.TrimSpace(rec.Title)
	key = wsReplacer.Replace(key)

	var buf bytes.Buffer
	buf.Grow(len(key) + 1 + len(p))
	buf.WriteString(key)
	buf.WriteByte('\t')
	buf.Write(p)
	return buf.Bytes(), nil
}
// MapperTitleNormalized decodes p as JSON and derives a normalized key from
// the Title field: the title is trimmed, passed through wsReplacer,
// lowercased, then repeatedWs matches are replaced by a single space and
// nonWord matches are removed (wsReplacer, repeatedWs and nonWord are
// declared elsewhere in the package). It returns the key, a tab and the
// unmodified input. A decoding failure is returned as is.
func MapperTitleNormalized(p []byte) ([]byte, error) {
	var rec struct {
		Title string
	}
	err := json.Unmarshal(p, &rec)
	if err != nil {
		return nil, err
	}
	key := strings.TrimSpace(rec.Title)
	key = wsReplacer.Replace(key)
	key = strings.ToLower(key)
	// Collapse whitespace runs before stripping non-word characters, in the
	// same order as before.
	key = repeatedWs.ReplaceAllString(key, " ")
	key = nonWord.ReplaceAllString(key, "")

	var buf bytes.Buffer
	buf.Grow(len(key) + 1 + len(p))
	buf.WriteString(key)
	buf.WriteByte('\t')
	buf.Write(p)
	return buf.Bytes(), nil
}
// MapperTitleNysiis decodes p as JSON, trims the Title field, passes it
// through wsReplacer and feeds the result to NYSIIS (both declared elsewhere
// in the package). It returns the value NYSIIS produced, a tab and the
// unmodified input. A decoding failure is returned as is.
func MapperTitleNysiis(p []byte) ([]byte, error) {
	var rec struct {
		Title string
	}
	err := json.Unmarshal(p, &rec)
	if err != nil {
		return nil, err
	}
	cleaned := wsReplacer.Replace(strings.TrimSpace(rec.Title))
	key := NYSIIS(cleaned)

	var buf bytes.Buffer
	buf.Grow(len(key) + 1 + len(p))
	buf.WriteString(key)
	buf.WriteByte('\t')
	buf.Write(p)
	return buf.Bytes(), nil
}
func MapperTitleSandcrawler(p []byte) ([]byte, error) {
var doc struct {
Title string
}
if err := json.Unmarshal(p, &doc); err != nil {
return nil, err
}
title := sandcrawlerSlugify(wsReplacer.Replace(strings.TrimSpace(doc.Title)))
return bytes.Join([][]byte{[]byte(title), p}, []byte("\t")), nil
}
|