diff options
Diffstat (limited to 'skate/map.go')
-rw-r--r-- | skate/map.go | 71 |
1 files changed, 71 insertions, 0 deletions
diff --git a/skate/map.go b/skate/map.go index ef9c018..53aed1e 100644 --- a/skate/map.go +++ b/skate/map.go @@ -4,11 +4,13 @@ import ( "bytes" "errors" "reflect" + "regexp" "runtime" "strings" "github.com/segmentio/encoding/json" "github.com/tidwall/gjson" + "golang.org/x/text/unicode/norm" ) var ( @@ -17,6 +19,55 @@ var ( ErrZeroFields = errors.New("zero fields") ErrMissingFieldName = errors.New("missing field name") + + wsReplacer = strings.NewReplacer("\t", " ", "\n", " ") + repeatedWs = regexp.MustCompile(`[ ]{2,}`) + nonWord = regexp.MustCompile(`[\W]+`) + + SandcrawlerCharMap = map[string]string{ + "\u00c6": "AE", + "\u00e6": "ae", + "\u00d0": "D", + "\u00f0": "d", + "\u00d8": "O", + "\u00f8": "o", + "\u00de": "Th", + "\u00fe": "th", + "\u00df": "s", + "\u0110": "D", + "\u0111": "d", + "\u0126": "H", + "\u0127": "h", + "\u0131": "i", + "\u0138": "k", + "\u0141": "L", + "\u0142": "l", + "\u014a": "N", + "\u014b": "n", + "\u0152": "Oe", + "\u0153": "oe", + "\u0166": "T", + "\u0167": "t", + "\u00b5": "u", + "c": "c", + "\u0192": "f", + "\u2202": "", + "\u0296": "", + "\u2211": "", + "\u220f": "", + "\u02c6": "", + "\u2603": "", + "\u02c7": "", + } + SandcrawlerPrefixRemove = []string{ + "original article: ", "original article ", "article: ", "title: ", + } + // SandcrawlerPrefixRemove does not have: + // InCombiningDiacriticalMarks (assume it's in "M"), + // https://unicodebook.readthedocs.io/unicode.html, + // https://stackoverflow.com/q/5697171/89391, + // https://github.com/google/re2/wiki/Syntax. + SandcrawlerRemoveCharRegex = regexp.MustCompile("[\\s\\p{P}\\p{M}\u2000-\u206F\u2E00-\u2E7F’\u0060·“”‘’“”«»「」¿–±§_°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]") ) // TitleDoc is a document with a title. @@ -342,3 +393,23 @@ func MapperPartial(p []byte) (fields [][]byte, err error) { // TODO: Group by some normlized container name or identifier. return nil, nil } + +// sandcrawlerSlugify normalizes a string. +func sandcrawlerSlugify(s string) string { + slug := strings.ToLower(strings.TrimSpace(s)) + for _, prefix := range SandcrawlerPrefixRemove { + if strings.HasPrefix(slug, prefix) { + slug = slug[:len(prefix)] + } + } + slug = strings.ReplaceAll(slug, "'", "'") + for k, v := range SandcrawlerCharMap { + slug = strings.ReplaceAll(slug, k, v) + } + if len(slug) == 0 { + return slug + } + slug = norm.NFKD.String(slug) + slug = SandcrawlerRemoveCharRegex.ReplaceAllString(slug, "") + return strings.ToLower(slug) +} |