aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cluster.go
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-03-21 01:17:38 +0100
committerMartin Czygan <martin.czygan@gmail.com>2021-03-21 01:17:38 +0100
commit09a7e8c9d013f13a1aa1ef4e9b7f397647b79967 (patch)
tree122b474e27afbc66cba1182e983ef5c8555ed12f /skate/cluster.go
parenta7e0cf191ebf8fb499e0ab9a3b6cae45727f1286 (diff)
downloadrefcat-09a7e8c9d013f13a1aa1ef4e9b7f397647b79967.tar.gz
refcat-09a7e8c9d013f13a1aa1ef4e9b7f397647b79967.zip
initial import of skate
Diffstat (limited to 'skate/cluster.go')
-rw-r--r--skate/cluster.go131
1 files changed, 131 insertions, 0 deletions
diff --git a/skate/cluster.go b/skate/cluster.go
new file mode 100644
index 0000000..bec8154
--- /dev/null
+++ b/skate/cluster.go
@@ -0,0 +1,131 @@
+package skate
+
+import (
+ "regexp"
+ "strings"
+
+ jsoniter "github.com/json-iterator/go"
+ "golang.org/x/text/unicode/norm"
+)
+
+// IdentifierKeyFunc is the signature of functions that derive an identifier
+// and a key from a raw blob; the KeyTitle variants in this file have this
+// signature.
+type IdentifierKeyFunc func(p []byte) (ident string, key string, err error)
+
+var (
+ json = jsoniter.ConfigCompatibleWithStandardLibrary // JSON codec used by KeyTitle
+ wsReplacer = strings.NewReplacer("\t", " ", "\n", " ") // turns each tab or newline into a space
+ repeatedWs = regexp.MustCompile(`[ ]{2,}`) // matches runs of two or more spaces
+ nonWord = regexp.MustCompile(`[\W]+`) // matches runs of non-word (\W) characters
+
+ SandcrawlerCharMap = map[string]string{ // per-character replacements applied by sandcrawlerSlugify
+ "\u00c6": "AE",
+ "\u00e6": "ae",
+ "\u00d0": "D",
+ "\u00f0": "d",
+ "\u00d8": "O",
+ "\u00f8": "o",
+ "\u00de": "Th",
+ "\u00fe": "th",
+ "\u00df": "s",
+ "\u0110": "D",
+ "\u0111": "d",
+ "\u0126": "H",
+ "\u0127": "h",
+ "\u0131": "i",
+ "\u0138": "k",
+ "\u0141": "L",
+ "\u0142": "l",
+ "\u014a": "N",
+ "\u014b": "n",
+ "\u0152": "Oe",
+ "\u0153": "oe",
+ "\u0166": "T",
+ "\u0167": "t",
+ "\u00b5": "u",
+ "c": "c",
+ "\u0192": "f",
+ "\u2202": "", // an empty value means the character is deleted
+ "\u0296": "",
+ "\u2211": "",
+ "\u220f": "",
+ "\u02c6": "",
+ "\u2603": "",
+ "\u02c7": "",
+ }
+ SandcrawlerPrefixRemove = []string{ // lowercase boilerplate prefixes matched against the start of the slug in sandcrawlerSlugify
+ "original article: ", "original article ", "article: ", "title: ",
+ }
+ // SandcrawlerRemoveCharRegex matches the characters that sandcrawlerSlugify
+ // deletes. It has no explicit InCombiningDiacriticalMarks class (assumed to be covered by \p{M}); see
+ // https://unicodebook.readthedocs.io/unicode.html,
+ // https://stackoverflow.com/q/5697171/89391,
+ // https://github.com/google/re2/wiki/Syntax.
+ SandcrawlerRemoveCharRegex = regexp.MustCompile("[\\s\\p{P}\\p{M}\u2000-\u206F\u2E00-\u2E7F’\u0060·“”‘’“”«»「」¿–±§_°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]")
+)
+
+// IdentTitleDoc is the minimal subset of document fields used here; KeyTitle decodes its JSON input into this struct.
+type IdentTitleDoc struct {
+ Ident string `json:"ident"` // taken from the "ident" JSON field
+ Title string `json:"title"` // taken from the "title" JSON field
+}
+
+// KeyTitle decodes the JSON document in p and returns its identifier together
+// with a lightly cleaned title: surrounding whitespace is trimmed first, then
+// every tab and newline is replaced by a space. If decoding fails, the error
+// is returned along with an empty ident and key.
+func KeyTitle(p []byte) (ident string, key string, err error) {
+	var record IdentTitleDoc
+	err = json.Unmarshal(p, &record)
+	if err != nil {
+		return "", "", err
+	}
+	trimmed := strings.TrimSpace(record.Title)
+	return record.Ident, wsReplacer.Replace(trimmed), nil
+}
+
+// KeyTitleNormalized starts from the result of KeyTitle and normalizes the key
+// further: the title is lowercased, runs of spaces are collapsed into one
+// space, and finally every run of non-word characters is removed. An error
+// from KeyTitle is passed through unchanged.
+func KeyTitleNormalized(p []byte) (ident string, key string, err error) {
+	id, title, err := KeyTitle(p)
+	if err != nil {
+		return id, title, err
+	}
+	lowered := strings.ToLower(title)
+	collapsed := repeatedWs.ReplaceAllString(lowered, " ")
+	return id, nonWord.ReplaceAllString(collapsed, ""), nil
+}
+
+// KeyTitleNysiis uses the NYSIIS (New York State Identification and
+// Intelligence System) phonetic code of the title as key: the title obtained
+// from KeyTitle is passed through the NYSIIS helper. An error from KeyTitle is
+// passed through unchanged.
+func KeyTitleNysiis(p []byte) (ident string, key string, err error) {
+	if ident, key, err = KeyTitle(p); err != nil {
+		return ident, key, err
+	}
+	key = NYSIIS(key)
+	return ident, key, nil
+}
+
+// KeyTitleSandcrawler uses a slug of the title as key: the title obtained from
+// KeyTitle is run through sandcrawlerSlugify. An error from KeyTitle is passed
+// through unchanged.
+func KeyTitleSandcrawler(p []byte) (ident string, key string, err error) {
+	id, title, err := KeyTitle(p)
+	if err != nil {
+		return id, title, err
+	}
+	return id, sandcrawlerSlugify(title), nil
+}
+
+// sandcrawlerSlugify reduces a title to a compact slug. The steps are, in
+// order:
+//
+//  1. trim surrounding whitespace and lowercase the input,
+//  2. strip each boilerplate prefix listed in SandcrawlerPrefixRemove,
+//  3. turn the "&apos;" entity into a plain apostrophe,
+//  4. apply the replacements from SandcrawlerCharMap,
+//  5. apply NFKD normalization and delete everything matched by
+//     SandcrawlerRemoveCharRegex,
+//  6. lowercase the result once more.
+//
+// An input that is empty after step 4 is returned as the empty string.
+func sandcrawlerSlugify(s string) string {
+	slug := strings.ToLower(strings.TrimSpace(s))
+	for _, prefix := range SandcrawlerPrefixRemove {
+		// Drop the prefix and keep the rest of the title. The previous
+		// slug[:len(prefix)] did the opposite: it kept only the prefix
+		// and discarded the actual title.
+		slug = strings.TrimPrefix(slug, prefix)
+	}
+	slug = strings.ReplaceAll(slug, "&apos;", "'")
+	// Map iteration order is unspecified; with the entries defined in this
+	// file that is harmless, as no replacement value reintroduces a
+	// character that another entry would rewrite.
+	for k, v := range SandcrawlerCharMap {
+		slug = strings.ReplaceAll(slug, k, v)
+	}
+	if len(slug) == 0 {
+		return slug
+	}
+	slug = norm.NFKD.String(slug)
+	slug = SandcrawlerRemoveCharRegex.ReplaceAllString(slug, "")
+	// Lowercase again, as the steps above may have produced uppercase
+	// letters (some SandcrawlerCharMap values contain them).
+	return strings.ToLower(slug)
+}