aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cluster.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/cluster.go')
-rw-r--r--skate/cluster.go155
1 files changed, 0 insertions, 155 deletions
diff --git a/skate/cluster.go b/skate/cluster.go
deleted file mode 100644
index 9e16b78..0000000
--- a/skate/cluster.go
+++ /dev/null
@@ -1,155 +0,0 @@
-package skate
-
import (
	"fmt"
	"regexp"
	"strconv"
	"strings"

	"github.com/segmentio/encoding/json"
	"golang.org/x/text/unicode/norm"
)
-
// IdentifierKeyFunc extracts an identifier and a key from one serialized
// record. The result names only document the order of the returned values.
type IdentifierKeyFunc func(p []byte) (ident string, key string, err error)
-
// Helpers for basic title cleanup.
var (
	// wsReplacer turns every tab and newline into a single space.
	wsReplacer = strings.NewReplacer("\t", " ", "\n", " ")
	// repeatedWs matches runs of two or more space characters.
	repeatedWs = regexp.MustCompile(`[ ]{2,}`)
	// nonWord matches runs of characters outside the regexp word class.
	nonWord = regexp.MustCompile(`[\W]+`)
)

// SandcrawlerCharMap lists literal replacements applied to a slug before
// Unicode normalization: letters are mapped to ASCII stand-ins, a handful of
// symbols are mapped to the empty string and thereby dropped.
var SandcrawlerCharMap = map[string]string{
	"\u00c6": "AE", "\u00e6": "ae", // Æ æ
	"\u00d0": "D", "\u00f0": "d",   // Ð ð
	"\u00d8": "O", "\u00f8": "o",   // Ø ø
	"\u00de": "Th", "\u00fe": "th", // Þ þ
	"\u00df": "s",                  // ß
	"\u0110": "D", "\u0111": "d",   // Đ đ
	"\u0126": "H", "\u0127": "h",   // Ħ ħ
	"\u0131": "i",                  // ı (dotless i)
	"\u0138": "k",                  // ĸ (kra)
	"\u0141": "L", "\u0142": "l",   // Ł ł
	"\u014a": "N", "\u014b": "n",   // Ŋ ŋ
	"\u0152": "Oe", "\u0153": "oe", // Œ œ
	"\u0166": "T", "\u0167": "t",   // Ŧ ŧ
	"\u00b5": "u",                  // µ (micro sign)
	"c": "c",                       // identity entry, replaces "c" with itself
	"\u0192": "f",                  // ƒ
	// Symbols below map to "" and are removed.
	"\u2202": "", // ∂
	"\u0296": "", // ʖ
	"\u2211": "", // ∑
	"\u220f": "", // ∏
	"\u02c6": "", // ˆ
	"\u2603": "", // ☃
	"\u02c7": "", // ˇ
}

// SandcrawlerPrefixRemove lists lower-case boilerplate prefixes that are
// looked for at the start of a title.
var SandcrawlerPrefixRemove = []string{
	"original article: ",
	"original article ",
	"article: ",
	"title: ",
}

// SandcrawlerRemoveCharRegex matches the characters that are deleted from a
// slug: whitespace, Unicode punctuation (P) and marks (M), the ranges
// U+2000-U+206F and U+2E00-U+2E7F, and an explicit list of symbols.
//
// The pattern has no InCombiningDiacriticalMarks class; those code points
// are assumed to be covered by "M". Background:
// https://unicodebook.readthedocs.io/unicode.html,
// https://stackoverflow.com/q/5697171/89391,
// https://github.com/google/re2/wiki/Syntax.
var SandcrawlerRemoveCharRegex = regexp.MustCompile("[\\s\\p{P}\\p{M}\u2000-\u206F\u2E00-\u2E7F’\u0060·“”‘’“”«»「」¿–±§_°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]")
-
// IdentTitleDoc carries the two JSON fields the title key functions read,
// "ident" and "title"; other fields of a document are not represented.
type IdentTitleDoc struct {
	Ident string `json:"ident"`
	Title string `json:"title"`
}
-
-// KeyTitle is extract the title, and slight cleaning.
-func KeyTitle(p []byte) (ident string, key string, err error) {
- var doc IdentTitleDoc
- if err = json.Unmarshal(p, &doc); err != nil {
- return ident, key, err
- }
- title := wsReplacer.Replace(strings.TrimSpace(doc.Title))
- return doc.Ident, title, nil
-}
-
-// KeyTitleNormalized applies further normalization.
-func KeyTitleNormalized(p []byte) (ident string, key string, err error) {
- ident, key, err = KeyTitle(p)
- if err != nil {
- return
- }
- key = strings.ToLower(key)
- key = repeatedWs.ReplaceAllString(key, " ")
- key = nonWord.ReplaceAllString(key, "")
- return ident, key, nil
-}
-
-// KeyTitleNysiis returns the New York State Identification and Intelligence
-// System phonetic code for the title.
-func KeyTitleNysiis(p []byte) (ident string, key string, err error) {
- ident, key, err = KeyTitle(p)
- if err != nil {
- return
- }
- return ident, NYSIIS(key), nil
-}
-
-// KeyTitleSandcrawler applies more sophisticated title cleanup.
-func KeyTitleSandcrawler(p []byte) (ident string, key string, err error) {
- ident, key, err = KeyTitle(p)
- if err != nil {
- return
- }
- return ident, sandcrawlerSlugify(key), nil
-}
-
-// CreateFixedFieldFunc creates an extractor function given a json path.
-// Currently only top level key is supported.
-func CreateFixedFieldFunc(path string) IdentifierKeyFunc {
- f := func(p []byte) (ident string, key string, err error) {
- var doc map[string]interface{}
- if err = json.Unmarshal(p, &doc); err != nil {
- return
- }
- v, ok := doc[path]
- if !ok {
- return "", "", nil
- }
- switch t := v.(type) {
- case string:
- return "", t, nil
- case int, int64, float32, float64:
- return "", fmt.Sprintf("%v", t), nil
- default:
- return "", "", nil
- }
- }
- return f
-}
-
-// sandcrawlerSlugify normalizes a string.
-func sandcrawlerSlugify(s string) string {
- slug := strings.ToLower(strings.TrimSpace(s))
- for _, prefix := range SandcrawlerPrefixRemove {
- if strings.HasPrefix(slug, prefix) {
- slug = slug[:len(prefix)]
- }
- }
- slug = strings.ReplaceAll(slug, "&apos;", "'")
- for k, v := range SandcrawlerCharMap {
- slug = strings.ReplaceAll(slug, k, v)
- }
- if len(slug) == 0 {
- return slug
- }
- slug = norm.NFKD.String(slug)
- slug = SandcrawlerRemoveCharRegex.ReplaceAllString(slug, "")
- return strings.ToLower(slug)
-}