From 09a7e8c9d013f13a1aa1ef4e9b7f397647b79967 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Sun, 21 Mar 2021 01:17:38 +0100 Subject: initial import of skate --- skate/cluster.go | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 skate/cluster.go (limited to 'skate/cluster.go') diff --git a/skate/cluster.go b/skate/cluster.go new file mode 100644 index 0000000..bec8154 --- /dev/null +++ b/skate/cluster.go @@ -0,0 +1,131 @@ +package skate + +import ( + "regexp" + "strings" + + jsoniter "github.com/json-iterator/go" + "golang.org/x/text/unicode/norm" +) + +// IdentifierKeyFunc returns the id and some key from a given blob. +type IdentifierKeyFunc func([]byte) (string, string, error) + +var ( + json = jsoniter.ConfigCompatibleWithStandardLibrary + wsReplacer = strings.NewReplacer("\t", " ", "\n", " ") + repeatedWs = regexp.MustCompile(`[ ]{2,}`) + nonWord = regexp.MustCompile(`[\W]+`) + + SandcrawlerCharMap = map[string]string{ + "\u00c6": "AE", + "\u00e6": "ae", + "\u00d0": "D", + "\u00f0": "d", + "\u00d8": "O", + "\u00f8": "o", + "\u00de": "Th", + "\u00fe": "th", + "\u00df": "s", + "\u0110": "D", + "\u0111": "d", + "\u0126": "H", + "\u0127": "h", + "\u0131": "i", + "\u0138": "k", + "\u0141": "L", + "\u0142": "l", + "\u014a": "N", + "\u014b": "n", + "\u0152": "Oe", + "\u0153": "oe", + "\u0166": "T", + "\u0167": "t", + "\u00b5": "u", + "c": "c", + "\u0192": "f", + "\u2202": "", + "\u0296": "", + "\u2211": "", + "\u220f": "", + "\u02c6": "", + "\u2603": "", + "\u02c7": "", + } + SandcrawlerPrefixRemove = []string{ + "original article: ", "original article ", "article: ", "title: ", + } + // SandcrawlerPrefixRemove does not have: + // InCombiningDiacriticalMarks (assume it's in "M"), + // https://unicodebook.readthedocs.io/unicode.html, + // https://stackoverflow.com/q/5697171/89391, + // https://github.com/google/re2/wiki/Syntax. + SandcrawlerRemoveCharRegex = regexp.MustCompile("[\\s\\p{P}\\p{M}\u2000-\u206F\u2E00-\u2E7F’\u0060·“”‘’“”«»「」¿–±§_°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]") +) + +// IdentTitleDoc is a minimal subset of fields, we can work with. +type IdentTitleDoc struct { + Ident string `json:"ident"` + Title string `json:"title"` +} + +// KeyTitle is extract the title, and slight cleaning. +func KeyTitle(p []byte) (ident string, key string, err error) { + var doc IdentTitleDoc + if err = json.Unmarshal(p, &doc); err != nil { + return ident, key, err + } + title := wsReplacer.Replace(strings.TrimSpace(doc.Title)) + return doc.Ident, title, nil +} + +// KeyTitleNormalized applies further normalization. +func KeyTitleNormalized(p []byte) (ident string, key string, err error) { + ident, key, err = KeyTitle(p) + if err != nil { + return + } + key = strings.ToLower(key) + key = repeatedWs.ReplaceAllString(key, " ") + key = nonWord.ReplaceAllString(key, "") + return ident, key, nil +} + +// KeyTitleNysiis returns the New York State Identification and Intelligence +// System phonetic code for the title. +func KeyTitleNysiis(p []byte) (ident string, key string, err error) { + ident, key, err = KeyTitle(p) + if err != nil { + return + } + return ident, NYSIIS(key), nil +} + +// KeyTitleSandcrawler applies more sophisticated title cleanup. +func KeyTitleSandcrawler(p []byte) (ident string, key string, err error) { + ident, key, err = KeyTitle(p) + if err != nil { + return + } + return ident, sandcrawlerSlugify(key), nil +} + +// sandcrawlerSlugify normalizes a string. +func sandcrawlerSlugify(s string) string { + slug := strings.ToLower(strings.TrimSpace(s)) + for _, prefix := range SandcrawlerPrefixRemove { + if strings.HasPrefix(slug, prefix) { + slug = slug[:len(prefix)] + } + } + slug = strings.ReplaceAll(slug, "'", "'") + for k, v := range SandcrawlerCharMap { + slug = strings.ReplaceAll(slug, k, v) + } + if len(slug) == 0 { + return slug + } + slug = norm.NFKD.String(slug) + slug = SandcrawlerRemoveCharRegex.ReplaceAllString(slug, "") + return strings.ToLower(slug) +} -- cgit v1.2.3