aboutsummaryrefslogtreecommitdiffstats
path: root/skate/map.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/map.go')
-rw-r--r--skate/map.go71
1 files changed, 71 insertions, 0 deletions
diff --git a/skate/map.go b/skate/map.go
index ef9c018..53aed1e 100644
--- a/skate/map.go
+++ b/skate/map.go
@@ -4,11 +4,13 @@ import (
"bytes"
"errors"
"reflect"
+ "regexp"
"runtime"
"strings"
"github.com/segmentio/encoding/json"
"github.com/tidwall/gjson"
+ "golang.org/x/text/unicode/norm"
)
var (
@@ -17,6 +19,55 @@ var (
ErrZeroFields = errors.New("zero fields")
ErrMissingFieldName = errors.New("missing field name")
+
+ wsReplacer = strings.NewReplacer("\t", " ", "\n", " ")
+ repeatedWs = regexp.MustCompile(`[ ]{2,}`)
+ nonWord = regexp.MustCompile(`[\W]+`)
+
+ SandcrawlerCharMap = map[string]string{
+ "\u00c6": "AE",
+ "\u00e6": "ae",
+ "\u00d0": "D",
+ "\u00f0": "d",
+ "\u00d8": "O",
+ "\u00f8": "o",
+ "\u00de": "Th",
+ "\u00fe": "th",
+ "\u00df": "s",
+ "\u0110": "D",
+ "\u0111": "d",
+ "\u0126": "H",
+ "\u0127": "h",
+ "\u0131": "i",
+ "\u0138": "k",
+ "\u0141": "L",
+ "\u0142": "l",
+ "\u014a": "N",
+ "\u014b": "n",
+ "\u0152": "Oe",
+ "\u0153": "oe",
+ "\u0166": "T",
+ "\u0167": "t",
+ "\u00b5": "u",
+ "c": "c",
+ "\u0192": "f",
+ "\u2202": "",
+ "\u0296": "",
+ "\u2211": "",
+ "\u220f": "",
+ "\u02c6": "",
+ "\u2603": "",
+ "\u02c7": "",
+ }
+ SandcrawlerPrefixRemove = []string{
+ "original article: ", "original article ", "article: ", "title: ",
+ }
+ // SandcrawlerRemoveCharRegex does not have an explicit
+ // InCombiningDiacriticalMarks class (we assume those code points are
+ // covered by the "M" (mark) category, i.e. \p{M}). See also:
+ // https://unicodebook.readthedocs.io/unicode.html,
+ // https://stackoverflow.com/q/5697171/89391,
+ // https://github.com/google/re2/wiki/Syntax.
+ SandcrawlerRemoveCharRegex = regexp.MustCompile("[\\s\\p{P}\\p{M}\u2000-\u206F\u2E00-\u2E7F’\u0060·“”‘’“”«»「」¿–±§_°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]")
)
// TitleDoc is a document with a title.
@@ -342,3 +393,23 @@ func MapperPartial(p []byte) (fields [][]byte, err error) {
// TODO: Group by some normalized container name or identifier.
return nil, nil
}
+
+// sandcrawlerSlugify normalizes a string.
+func sandcrawlerSlugify(s string) string {
+ slug := strings.ToLower(strings.TrimSpace(s))
+ for _, prefix := range SandcrawlerPrefixRemove {
+ if strings.HasPrefix(slug, prefix) {
+ slug = slug[:len(prefix)]
+ }
+ }
+ slug = strings.ReplaceAll(slug, "&apos;", "'")
+ for k, v := range SandcrawlerCharMap {
+ slug = strings.ReplaceAll(slug, k, v)
+ }
+ if len(slug) == 0 {
+ return slug
+ }
+ slug = norm.NFKD.String(slug)
+ slug = SandcrawlerRemoveCharRegex.ReplaceAllString(slug, "")
+ return strings.ToLower(slug)
+}