skate/cluster.go


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130

package skate

import (
	"regexp"
	"strings"

	json "github.com/segmentio/encoding/json"
	"golang.org/x/text/unicode/norm"
)

// IdentifierKeyFunc derives an identifier and a key from a raw blob. The two
// returned strings are, in order, the identifier and the key; a non-nil error
// reports that they could not be derived from the input.
type IdentifierKeyFunc func([]byte) (string, string, error)

var (
	// Helpers for basic title cleanup: wsReplacer turns each tab and newline
	// into a space, repeatedWs matches runs of two or more spaces and nonWord
	// matches runs of non-word (\W) characters.
	wsReplacer = strings.NewReplacer("\t", " ", "\n", " ")
	repeatedWs = regexp.MustCompile(`[ ]{2,}`)
	nonWord    = regexp.MustCompile(`[\W]+`)

	// SandcrawlerCharMap lists substring replacements applied by
	// sandcrawlerSlugify before Unicode normalization. Most entries map a
	// single non-ASCII character to an ASCII replacement; entries with an
	// empty value cause the character to be dropped.
	SandcrawlerCharMap = map[string]string{
		"\u00c6": "AE",
		"\u00e6": "ae",
		"\u00d0": "D",
		"\u00f0": "d",
		"\u00d8": "O",
		"\u00f8": "o",
		"\u00de": "Th",
		"\u00fe": "th",
		"\u00df": "s",
		"\u0110": "D",
		"\u0111": "d",
		"\u0126": "H",
		"\u0127": "h",
		"\u0131": "i",
		"\u0138": "k",
		"\u0141": "L",
		"\u0142": "l",
		"\u014a": "N",
		"\u014b": "n",
		"\u0152": "Oe",
		"\u0153": "oe",
		"\u0166": "T",
		"\u0167": "t",
		"\u00b5": "u",
		// NOTE(review): this entry maps "c" to itself and is therefore a
		// no-op; possibly a different code point was intended - confirm.
		"c":      "c",
		"\u0192": "f",
		"\u2202": "",
		"\u0296": "",
		"\u2211": "",
		"\u220f": "",
		"\u02c6": "",
		"\u2603": "",
		"\u02c7": "",
	}
	// SandcrawlerPrefixRemove lists lowercase boilerplate prefixes that
	// sandcrawlerSlugify checks for at the start of a lowercased title.
	SandcrawlerPrefixRemove = []string{
		"original article: ", "original article ", "article: ", "title: ",
	}
	// SandcrawlerRemoveCharRegex matches every character that
	// sandcrawlerSlugify deletes: whitespace, punctuation, marks, two Unicode
	// ranges and a list of individual symbols. The pattern does not have
	// InCombiningDiacriticalMarks (assume it's in "M"); see
	// https://unicodebook.readthedocs.io/unicode.html,
	// https://stackoverflow.com/q/5697171/89391,
	// https://github.com/google/re2/wiki/Syntax.
	SandcrawlerRemoveCharRegex = regexp.MustCompile("[\\s\\p{P}\\p{M}\u2000-\u206F\u2E00-\u2E7F’\u0060·“”‘’“”«»「」¿–±§_°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]")
)

// IdentTitleDoc is the minimal subset of document fields the key functions
// work with. It is decoded from JSON; only the "ident" and "title" fields are
// mapped.
type IdentTitleDoc struct {
	Ident string `json:"ident"` // document identifier, JSON field "ident"
	Title string `json:"title"` // raw document title, JSON field "title"
}

// KeyTitle decodes p as a JSON document and returns its identifier together
// with a lightly cleaned title: surrounding whitespace is trimmed and every
// remaining tab or newline is replaced by a space. If p cannot be decoded,
// both strings are empty and the decoding error is returned.
func KeyTitle(p []byte) (ident string, key string, err error) {
	doc := IdentTitleDoc{}
	err = json.Unmarshal(p, &doc)
	if err != nil {
		return "", "", err
	}
	trimmed := strings.TrimSpace(doc.Title)
	return doc.Ident, wsReplacer.Replace(trimmed), nil
}

// KeyTitleNormalized builds on KeyTitle and normalizes the title further: it
// is lowercased, runs of spaces are collapsed into one, and finally every run
// of non-word (\W) characters is removed. Errors from KeyTitle are passed
// through unchanged.
func KeyTitleNormalized(p []byte) (ident string, key string, err error) {
	if ident, key, err = KeyTitle(p); err != nil {
		return ident, key, err
	}
	normalized := strings.ToLower(key)
	normalized = repeatedWs.ReplaceAllString(normalized, " ")
	normalized = nonWord.ReplaceAllString(normalized, "")
	return ident, normalized, nil
}

// KeyTitleNysiis uses the NYSIIS (New York State Identification and
// Intelligence System) phonetic code of the title as key. The title is first
// extracted with KeyTitle; any error from that step is passed through.
func KeyTitleNysiis(p []byte) (ident string, key string, err error) {
	id, title, titleErr := KeyTitle(p)
	if titleErr != nil {
		return id, title, titleErr
	}
	return id, NYSIIS(title), nil
}

// KeyTitleSandcrawler uses the slugified title as key: the title is extracted
// with KeyTitle and then passed through sandcrawlerSlugify. Any error from
// KeyTitle is passed through.
func KeyTitleSandcrawler(p []byte) (ident string, key string, err error) {
	id, title, titleErr := KeyTitle(p)
	if titleErr != nil {
		return id, title, titleErr
	}
	slug := sandcrawlerSlugify(title)
	return id, slug, nil
}

// sandcrawlerSlugify normalizes a string into a compact slug.
//
// The steps are, in order: trim surrounding whitespace and lowercase the
// input; strip any leading boilerplate listed in SandcrawlerPrefixRemove;
// turn the "&apos;" entity into an apostrophe; apply the substitutions from
// SandcrawlerCharMap; apply NFKD normalization; delete every character
// matched by SandcrawlerRemoveCharRegex; lowercase the result once more.
// If nothing is left after the substitutions, the empty string is returned.
func sandcrawlerSlugify(s string) string {
	slug := strings.ToLower(strings.TrimSpace(s))
	for _, prefix := range SandcrawlerPrefixRemove {
		// Drop the prefix and keep the remainder. The previous code sliced
		// slug[:len(prefix)], which kept only the prefix and threw away the
		// actual title.
		slug = strings.TrimPrefix(slug, prefix)
	}
	slug = strings.ReplaceAll(slug, "&apos;", "'")
	for k, v := range SandcrawlerCharMap {
		slug = strings.ReplaceAll(slug, k, v)
	}
	if slug == "" {
		return slug
	}
	// Compatibility decomposition splits accented characters into a base
	// character plus combining marks, so the marks can be removed by the
	// regex below (it includes \p{M}).
	slug = norm.NFKD.String(slug)
	slug = SandcrawlerRemoveCharRegex.ReplaceAllString(slug, "")
	return strings.ToLower(slug)
}