aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cluster.go
blob: 9e16b78592093e5ab342ef0e7c7b4820edcbaaf7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
package skate

import (
	"fmt"
	"regexp"
	"strings"

	"github.com/segmentio/encoding/json"
	"golang.org/x/text/unicode/norm"
)

// IdentifierKeyFunc maps a raw (JSON) blob to a document identifier and a
// derived key (e.g. a cleaned, normalized or phonetic form of the title).
// A non-nil error indicates the blob could not be processed.
type IdentifierKeyFunc func([]byte) (string, string, error)

var (
	// wsReplacer folds tabs and newlines into single spaces.
	wsReplacer = strings.NewReplacer("\t", " ", "\n", " ")
	// repeatedWs matches runs of two or more spaces.
	repeatedWs = regexp.MustCompile(`[ ]{2,}`)
	// nonWord matches runs of non-word characters.
	nonWord = regexp.MustCompile(`[\W]+`)

	// SandcrawlerCharMap folds a fixed set of special characters to ASCII
	// replacements (or drops them), used by sandcrawlerSlugify.
	SandcrawlerCharMap = map[string]string{
		"\u00c6": "AE",
		"\u00e6": "ae",
		"\u00d0": "D",
		"\u00f0": "d",
		"\u00d8": "O",
		"\u00f8": "o",
		"\u00de": "Th",
		"\u00fe": "th",
		"\u00df": "s",
		"\u0110": "D",
		"\u0111": "d",
		"\u0126": "H",
		"\u0127": "h",
		"\u0131": "i",
		"\u0138": "k",
		"\u0141": "L",
		"\u0142": "l",
		"\u014a": "N",
		"\u014b": "n",
		"\u0152": "Oe",
		"\u0153": "oe",
		"\u0166": "T",
		"\u0167": "t",
		"\u00b5": "u",
		// NOTE(review): "c" -> "c" is a no-op; possibly a mistranscribed
		// escape (e.g. "\u00a9") — confirm against the upstream char map.
		"c":      "c",
		"\u0192": "f",
		"\u2202": "",
		"\u0296": "",
		"\u2211": "",
		"\u220f": "",
		"\u02c6": "",
		"\u2603": "",
		"\u02c7": "",
	}
	// SandcrawlerPrefixRemove lists boilerplate title prefixes to strip
	// (compared after lowercasing).
	SandcrawlerPrefixRemove = []string{
		"original article: ", "original article ", "article: ", "title: ",
	}
	// SandcrawlerRemoveCharRegex does not have:
	// InCombiningDiacriticalMarks (assume it's in "M"),
	// https://unicodebook.readthedocs.io/unicode.html,
	// https://stackoverflow.com/q/5697171/89391,
	// https://github.com/google/re2/wiki/Syntax.
	SandcrawlerRemoveCharRegex = regexp.MustCompile("[\\s\\p{P}\\p{M}\u2000-\u206F\u2E00-\u2E7F’\u0060·“”‘’“”«»「」¿–±§_°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]")
)

// IdentTitleDoc is the minimal subset of document fields we need to work
// with: a stable identifier and a title.
type IdentTitleDoc struct {
	Ident string `json:"ident"`
	Title string `json:"title"`
}

// KeyTitle extracts the ident and the title from a JSON blob, with slight
// cleaning: tabs and newlines are folded into spaces and surrounding
// whitespace is trimmed.
func KeyTitle(p []byte) (ident string, key string, err error) {
	var doc IdentTitleDoc
	if err := json.Unmarshal(p, &doc); err != nil {
		return "", "", err
	}
	cleaned := wsReplacer.Replace(strings.TrimSpace(doc.Title))
	return doc.Ident, cleaned, nil
}

// KeyTitleNormalized applies further normalization on top of KeyTitle: the
// key is lowercased, runs of spaces are collapsed, and all remaining
// non-word characters are removed.
func KeyTitleNormalized(p []byte) (ident string, key string, err error) {
	if ident, key, err = KeyTitle(p); err != nil {
		return ident, key, err
	}
	normalized := strings.ToLower(key)
	normalized = repeatedWs.ReplaceAllString(normalized, " ")
	return ident, nonWord.ReplaceAllString(normalized, ""), nil
}

// KeyTitleNysiis derives a phonetic key from the title via the New York
// State Identification and Intelligence System (NYSIIS) algorithm.
func KeyTitleNysiis(p []byte) (ident string, key string, err error) {
	if ident, key, err = KeyTitle(p); err != nil {
		return ident, key, err
	}
	return ident, NYSIIS(key), nil
}

// KeyTitleSandcrawler derives a key from the title using the more
// sophisticated sandcrawler-style slugification.
func KeyTitleSandcrawler(p []byte) (ident string, key string, err error) {
	if ident, key, err = KeyTitle(p); err != nil {
		return ident, key, err
	}
	return ident, sandcrawlerSlugify(key), nil
}

// CreateFixedFieldFunc creates an extractor function for a given JSON path.
// Currently only a top-level key is supported. The returned function always
// yields an empty ident; string values pass through as the key, numeric
// values are formatted with fmt, and anything else (or a missing key)
// results in an empty key.
func CreateFixedFieldFunc(path string) IdentifierKeyFunc {
	return func(p []byte) (string, string, error) {
		var doc map[string]interface{}
		if err := json.Unmarshal(p, &doc); err != nil {
			return "", "", err
		}
		value, ok := doc[path]
		if !ok {
			return "", "", nil
		}
		switch w := value.(type) {
		case string:
			return "", w, nil
		case int, int64, float32, float64:
			return "", fmt.Sprintf("%v", w), nil
		}
		return "", "", nil
	}
}

// sandcrawlerSlugify normalizes a string for fuzzy matching: it lowercases
// and trims, strips known boilerplate prefixes (SandcrawlerPrefixRemove),
// decodes a stray XML apostrophe entity, folds special characters to ASCII
// (SandcrawlerCharMap), applies unicode NFKD normalization and finally
// removes whitespace, punctuation, combining marks and assorted symbols
// (SandcrawlerRemoveCharRegex).
func sandcrawlerSlugify(s string) string {
	slug := strings.ToLower(strings.TrimSpace(s))
	for _, prefix := range SandcrawlerPrefixRemove {
		// Fix: previously this was slug[:len(prefix)], which kept ONLY the
		// prefix and discarded the rest of the title; the intent is to
		// remove the prefix.
		slug = strings.TrimPrefix(slug, prefix)
	}
	slug = strings.ReplaceAll(slug, "&apos;", "'")
	for k, v := range SandcrawlerCharMap {
		slug = strings.ReplaceAll(slug, k, v)
	}
	if len(slug) == 0 {
		return slug
	}
	slug = norm.NFKD.String(slug)
	slug = SandcrawlerRemoveCharRegex.ReplaceAllString(slug, "")
	return strings.ToLower(slug)
}