diff options
Diffstat (limited to 'skate/cluster.go')
-rw-r--r-- | skate/cluster.go | 155 |
1 files changed, 0 insertions, 155 deletions
diff --git a/skate/cluster.go b/skate/cluster.go deleted file mode 100644 index 9e16b78..0000000 --- a/skate/cluster.go +++ /dev/null @@ -1,155 +0,0 @@ -package skate - -import ( - "fmt" - "regexp" - "strings" - - "github.com/segmentio/encoding/json" - "golang.org/x/text/unicode/norm" -) - -// IdentifierKeyFunc returns the id and some key from a given blob. -type IdentifierKeyFunc func([]byte) (string, string, error) - -var ( - wsReplacer = strings.NewReplacer("\t", " ", "\n", " ") - repeatedWs = regexp.MustCompile(`[ ]{2,}`) - nonWord = regexp.MustCompile(`[\W]+`) - - SandcrawlerCharMap = map[string]string{ - "\u00c6": "AE", - "\u00e6": "ae", - "\u00d0": "D", - "\u00f0": "d", - "\u00d8": "O", - "\u00f8": "o", - "\u00de": "Th", - "\u00fe": "th", - "\u00df": "s", - "\u0110": "D", - "\u0111": "d", - "\u0126": "H", - "\u0127": "h", - "\u0131": "i", - "\u0138": "k", - "\u0141": "L", - "\u0142": "l", - "\u014a": "N", - "\u014b": "n", - "\u0152": "Oe", - "\u0153": "oe", - "\u0166": "T", - "\u0167": "t", - "\u00b5": "u", - "c": "c", - "\u0192": "f", - "\u2202": "", - "\u0296": "", - "\u2211": "", - "\u220f": "", - "\u02c6": "", - "\u2603": "", - "\u02c7": "", - } - SandcrawlerPrefixRemove = []string{ - "original article: ", "original article ", "article: ", "title: ", - } - // SandcrawlerRemoveCharRegex does not have: - // InCombiningDiacriticalMarks (assume it's in "M"), - // https://unicodebook.readthedocs.io/unicode.html, - // https://stackoverflow.com/q/5697171/89391, - // https://github.com/google/re2/wiki/Syntax. - SandcrawlerRemoveCharRegex = regexp.MustCompile("[\\s\\p{P}\\p{M}\u2000-\u206F\u2E00-\u2E7F’\u0060·“”‘’“”«»「」¿–±§_°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]") -) - -// IdentTitleDoc is a minimal subset of fields we can work with. -type IdentTitleDoc struct { - Ident string `json:"ident"` - Title string `json:"title"` -} - -// KeyTitle extracts the title and applies slight cleaning. 
-func KeyTitle(p []byte) (ident string, key string, err error) { - var doc IdentTitleDoc - if err = json.Unmarshal(p, &doc); err != nil { - return ident, key, err - } - title := wsReplacer.Replace(strings.TrimSpace(doc.Title)) - return doc.Ident, title, nil -} - -// KeyTitleNormalized applies further normalization. -func KeyTitleNormalized(p []byte) (ident string, key string, err error) { - ident, key, err = KeyTitle(p) - if err != nil { - return - } - key = strings.ToLower(key) - key = repeatedWs.ReplaceAllString(key, " ") - key = nonWord.ReplaceAllString(key, "") - return ident, key, nil -} - -// KeyTitleNysiis returns the New York State Identification and Intelligence -// System phonetic code for the title. -func KeyTitleNysiis(p []byte) (ident string, key string, err error) { - ident, key, err = KeyTitle(p) - if err != nil { - return - } - return ident, NYSIIS(key), nil -} - -// KeyTitleSandcrawler applies more sophisticated title cleanup. -func KeyTitleSandcrawler(p []byte) (ident string, key string, err error) { - ident, key, err = KeyTitle(p) - if err != nil { - return - } - return ident, sandcrawlerSlugify(key), nil -} - -// CreateFixedFieldFunc creates an extractor function given a json path. -// Currently only top level key is supported. -func CreateFixedFieldFunc(path string) IdentifierKeyFunc { - f := func(p []byte) (ident string, key string, err error) { - var doc map[string]interface{} - if err = json.Unmarshal(p, &doc); err != nil { - return - } - v, ok := doc[path] - if !ok { - return "", "", nil - } - switch t := v.(type) { - case string: - return "", t, nil - case int, int64, float32, float64: - return "", fmt.Sprintf("%v", t), nil - default: - return "", "", nil - } - } - return f -} - -// sandcrawlerSlugify normalizes a string. 
-func sandcrawlerSlugify(s string) string { - slug := strings.ToLower(strings.TrimSpace(s)) - for _, prefix := range SandcrawlerPrefixRemove { - if strings.HasPrefix(slug, prefix) { - slug = slug[:len(prefix)] - } - } - slug = strings.ReplaceAll(slug, "'", "'") - for k, v := range SandcrawlerCharMap { - slug = strings.ReplaceAll(slug, k, v) - } - if len(slug) == 0 { - return slug - } - slug = norm.NFKD.String(slug) - slug = SandcrawlerRemoveCharRegex.ReplaceAllString(slug, "") - return strings.ToLower(slug) -} |