5 files changed, 72 insertions, 311 deletions
diff --git a/skate/Makefile b/skate/Makefile
index 35cf2a9..8092cbe 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -1,5 +1,5 @@
 SHELL := /bin/bash
-TARGETS := skate-conv skate-cluster skate-cleanup skate-from-unstructured skate-wikipedia-doi skate-dot skate-map skate-reduce skate-cdx-lookup skate-resolve-journal-name
+TARGETS := skate-conv skate-cleanup skate-from-unstructured skate-wikipedia-doi skate-dot skate-map skate-reduce skate-cdx-lookup skate-resolve-journal-name
 PKGNAME := skate
 
 .PHONY: test
diff --git a/skate/cluster.go b/skate/cluster.go
deleted file mode 100644
index 9e16b78..0000000
--- a/skate/cluster.go
+++ /dev/null
@@ -1,155 +0,0 @@
-package skate
-
-import (
-	"fmt"
-	"regexp"
-	"strings"
-
-	"github.com/segmentio/encoding/json"
-	"golang.org/x/text/unicode/norm"
-)
-
-// IdentifierKeyFunc returns the id and some key from a given blob.
-type IdentifierKeyFunc func([]byte) (string, string, error)
-
-var (
-	wsReplacer = strings.NewReplacer("\t", " ", "\n", " ")
-	repeatedWs = regexp.MustCompile(`[ ]{2,}`)
-	nonWord    = regexp.MustCompile(`[\W]+`)
-
-	SandcrawlerCharMap = map[string]string{
-		"\u00c6": "AE",
-		"\u00e6": "ae",
-		"\u00d0": "D",
-		"\u00f0": "d",
-		"\u00d8": "O",
-		"\u00f8": "o",
-		"\u00de": "Th",
-		"\u00fe": "th",
-		"\u00df": "s",
-		"\u0110": "D",
-		"\u0111": "d",
-		"\u0126": "H",
-		"\u0127": "h",
-		"\u0131": "i",
-		"\u0138": "k",
-		"\u0141": "L",
-		"\u0142": "l",
-		"\u014a": "N",
-		"\u014b": "n",
-		"\u0152": "Oe",
-		"\u0153": "oe",
-		"\u0166": "T",
-		"\u0167": "t",
-		"\u00b5": "u",
-		"c":      "c",
-		"\u0192": "f",
-		"\u2202": "",
-		"\u0296": "",
-		"\u2211": "",
-		"\u220f": "",
-		"\u02c6": "",
-		"\u2603": "",
-		"\u02c7": "",
-	}
-	SandcrawlerPrefixRemove = []string{
-		"original article: ", "original article ", "article: ", "title: ",
-	}
-	// SandcrawlerPrefixRemove does not have:
-	// InCombiningDiacriticalMarks (assume it's in "M"),
-	// https://unicodebook.readthedocs.io/unicode.html,
-	// https://stackoverflow.com/q/5697171/89391,
-	// https://github.com/google/re2/wiki/Syntax.
-	SandcrawlerRemoveCharRegex = regexp.MustCompile("[\\s\\p{P}\\p{M}\u2000-\u206F\u2E00-\u2E7F’\u0060·“”‘’“”«»「」¿–±§_°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]")
-)
-
-// IdentTitleDoc is a minimal subset of fields, we can work with.
-type IdentTitleDoc struct {
-	Ident string `json:"ident"`
-	Title string `json:"title"`
-}
-
-// KeyTitle is extract the title, and slight cleaning.
-func KeyTitle(p []byte) (ident string, key string, err error) {
-	var doc IdentTitleDoc
-	if err = json.Unmarshal(p, &doc); err != nil {
-		return ident, key, err
-	}
-	title := wsReplacer.Replace(strings.TrimSpace(doc.Title))
-	return doc.Ident, title, nil
-}
-
-// KeyTitleNormalized applies further normalization.
-func KeyTitleNormalized(p []byte) (ident string, key string, err error) {
-	ident, key, err = KeyTitle(p)
-	if err != nil {
-		return
-	}
-	key = strings.ToLower(key)
-	key = repeatedWs.ReplaceAllString(key, " ")
-	key = nonWord.ReplaceAllString(key, "")
-	return ident, key, nil
-}
-
-// KeyTitleNysiis returns the New York State Identification and Intelligence
-// System phonetic code for the title.
-func KeyTitleNysiis(p []byte) (ident string, key string, err error) {
-	ident, key, err = KeyTitle(p)
-	if err != nil {
-		return
-	}
-	return ident, NYSIIS(key), nil
-}
-
-// KeyTitleSandcrawler applies more sophisticated title cleanup.
-func KeyTitleSandcrawler(p []byte) (ident string, key string, err error) {
-	ident, key, err = KeyTitle(p)
-	if err != nil {
-		return
-	}
-	return ident, sandcrawlerSlugify(key), nil
-}
-
-// CreateFixedFieldFunc creates an extractor function given a json path.
-// Currently only top level key is supported.
-func CreateFixedFieldFunc(path string) IdentifierKeyFunc {
-	f := func(p []byte) (ident string, key string, err error) {
-		var doc map[string]interface{}
-		if err = json.Unmarshal(p, &doc); err != nil {
-			return
-		}
-		v, ok := doc[path]
-		if !ok {
-			return "", "", nil
-		}
-		switch t := v.(type) {
-		case string:
-			return "", t, nil
-		case int, int64, float32, float64:
-			return "", fmt.Sprintf("%v", t), nil
-		default:
-			return "", "", nil
-		}
-	}
-	return f
-}
-
-// sandcrawlerSlugify normalizes a string.
-func sandcrawlerSlugify(s string) string {
-	slug := strings.ToLower(strings.TrimSpace(s))
-	for _, prefix := range SandcrawlerPrefixRemove {
-		if strings.HasPrefix(slug, prefix) {
-			slug = slug[:len(prefix)]
-		}
-	}
-	slug = strings.ReplaceAll(slug, "&apos;", "'")
-	for k, v := range SandcrawlerCharMap {
-		slug = strings.ReplaceAll(slug, k, v)
-	}
-	if len(slug) == 0 {
-		return slug
-	}
-	slug = norm.NFKD.String(slug)
-	slug = SandcrawlerRemoveCharRegex.ReplaceAllString(slug, "")
-	return strings.ToLower(slug)
-}
diff --git a/skate/cluster_test.go b/skate/cluster_test.go
deleted file mode 100644
index 1c8819e..0000000
--- a/skate/cluster_test.go
+++ /dev/null
@@ -1,43 +0,0 @@
-package skate
-
-import "testing"
-
-func TestKeyTitleSandcrawler(t *testing.T) {
-	var cases = []struct {
-		b     []byte
-		ident string
-		key   string
-		err   error
-	}{
-		{
-			[]byte(`{"ident": "123", "title": "abc"}`),
-			"123",
-			"abc",
-			nil,
-		},
-		{
-			[]byte(`{"ident": "123", "title": "abc++***##???ßßß"}`),
-			"123",
-			"abcsss",
-			nil,
-		},
-		{
-			[]byte(`{"ident": "123", "title": "A k"}`),
-			"123",
-			"ak",
-			nil,
-		},
-	}
-	for _, c := range cases {
-		ident, key, err := KeyTitleSandcrawler(c.b)
-		if key != c.key {
-			t.Errorf("[key] got %v, want %v", key, c.key)
-		}
-		if ident != c.ident {
-			t.Errorf("[ident] got %v, want %v", ident, c.ident)
-		}
-		if err != c.err {
-			t.Errorf("[err] got %v, want %v", err, c.err)
-		}
-	}
-}
diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go
deleted file mode 100644
index de11de1..0000000
--- a/skate/cmd/skate-cluster/main.go
+++ /dev/null
@@ -1,112 +0,0 @@
-// skate-cluster takes the (tab) output of skate-map (plus sort) and generates
-// a "cluster" document, grouping docs by key. Can do some pre-filtering (e.g.
-// require refs and release docs in a single cluster).
-//
-// For example, this:
-//
-//     id123    somekey123    {"a":"b", ...}
-//     id391    somekey123    {"x":"y", ...}
-//
-// would turn into (a single line containing all docs with the same key).
-//
-//     {"k": "somekey123", "v": [{"a":"b", ...},{"x":"y",...}]}
-//
-// A single line cluster is easier to parallelize (e.g. for verification, etc.).
-package main
-
-import (
-	"bufio"
-	"flag"
-	"fmt"
-	"io"
-	"log"
-	"os"
-	"strings"
-)
-
-var (
-	keyField       = flag.Int("k", 2, "which column contains the key (one based)")
-	docField       = flag.Int("d", 3, "which column contains the doc (one based)")
-	minClusterSize = flag.Int("min", 2, "minimum cluster size")
-	maxClusterSize = flag.Int("max", 100000, "maximum cluster size")
-	requireBoth    = flag.Bool("both", false,
-		"require at least one ref and one non-ref item present in the cluster, implies -min 2")
-	dropEmptyKeys = flag.Bool("D", false, "drop empty keys")
-	delimiter     = flag.String("d", "\t", "field delimiter")
-)
-
-func main() {
-	flag.Parse()
-	var (
-		br             = bufio.NewReader(os.Stdin)
-		bw             = bufio.NewWriter(os.Stdout)
-		prev, key, doc string
-		batch, fields  []string
-		keyIndex       = *keyField - 1
-		docIndex       = *docField - 1
-		line           string
-		err            error
-	)
-	defer bw.Flush()
-	for {
-		line, err = br.ReadString('\n')
-		if err == io.EOF {
-			break
-		}
-		if err != nil {
-			log.Fatal(err)
-		}
-		fields = strings.Split(line, *delimiter)
-		if len(fields) <= keyIndex || len(fields) <= docIndex {
-			log.Fatalf("line has only %d fields", len(fields))
-		}
-		key = strings.TrimSpace(fields[keyIndex])
-		if *dropEmptyKeys && len(key) == 0 {
-			continue
-		}
-		doc = strings.TrimSpace(fields[docIndex])
-		if prev != key {
-			if err := writeBatch(bw, key, batch); err != nil {
-				log.Fatal(err)
-			}
-			batch = nil
-		}
-		prev = key
-		batch = append(batch, doc)
-	}
-	if err := writeBatch(bw, prev, batch); err != nil {
-		log.Fatal(err)
-	}
-}
-
-// containsBoth return true, if we have a ref and a non-ref item in the batch.
-func containsBoth(batch []string) bool {
-	var numRef int
-	for _, doc := range batch {
-		// This is brittle (but faster). Most JSON should be in compact form,
-		// and there the following chars are by convention added to distinguish
-		// a release coming from a reference doc from other releases.
-		if strings.Contains(doc, `"status":"ref"`) {
-			numRef++
-		}
-	}
-	return numRef > 0 && numRef < len(batch)
-}
-
-// writeBatch writes out a single line containing the key and the cluster values.
-func writeBatch(w io.Writer, key string, batch []string) (err error) {
-	if len(batch) == 0 {
-		return nil
-	}
-	if len(batch) < *minClusterSize || len(batch) > *maxClusterSize {
-		return nil
-	}
-	if *requireBoth && !containsBoth(batch) {
-		return nil
-	}
-	// This is brittle (and fast), but all items in a batch are valid JSON
-	// objects, hence, the following will be valid JSON as well, or will it?
-	// The key should not contain a quote.
-	_, err = fmt.Fprintf(w, "{\"k\": \"%s\", \"v\": [%s]}\n", key, strings.Join(batch, ","))
-	return
-}
diff --git a/skate/map.go b/skate/map.go
index ef9c018..53aed1e 100644
--- a/skate/map.go
+++ b/skate/map.go
@@ -4,11 +4,13 @@ import (
 	"bytes"
 	"errors"
 	"reflect"
+	"regexp"
 	"runtime"
 	"strings"
 
 	"github.com/segmentio/encoding/json"
 	"github.com/tidwall/gjson"
+	"golang.org/x/text/unicode/norm"
 )
 
 var (
@@ -17,6 +19,55 @@ var (
 
 	ErrZeroFields       = errors.New("zero fields")
 	ErrMissingFieldName = errors.New("missing field name")
+
+	wsReplacer = strings.NewReplacer("\t", " ", "\n", " ")
+	repeatedWs = regexp.MustCompile(`[ ]{2,}`)
+	nonWord    = regexp.MustCompile(`[\W]+`)
+
+	SandcrawlerCharMap = map[string]string{
+		"\u00c6": "AE",
+		"\u00e6": "ae",
+		"\u00d0": "D",
+		"\u00f0": "d",
+		"\u00d8": "O",
+		"\u00f8": "o",
+		"\u00de": "Th",
+		"\u00fe": "th",
+		"\u00df": "s",
+		"\u0110": "D",
+		"\u0111": "d",
+		"\u0126": "H",
+		"\u0127": "h",
+		"\u0131": "i",
+		"\u0138": "k",
+		"\u0141": "L",
+		"\u0142": "l",
+		"\u014a": "N",
+		"\u014b": "n",
+		"\u0152": "Oe",
+		"\u0153": "oe",
+		"\u0166": "T",
+		"\u0167": "t",
+		"\u00b5": "u",
+		"c":      "c",
+		"\u0192": "f",
+		"\u2202": "",
+		"\u0296": "",
+		"\u2211": "",
+		"\u220f": "",
+		"\u02c6": "",
+		"\u2603": "",
+		"\u02c7": "",
+	}
+	SandcrawlerPrefixRemove = []string{
+		"original article: ", "original article ", "article: ", "title: ",
+	}
+	// SandcrawlerRemoveCharRegex does not include
+	// InCombiningDiacriticalMarks (assumed to be covered by "M"), see:
+	// https://unicodebook.readthedocs.io/unicode.html,
+	// https://stackoverflow.com/q/5697171/89391,
+	// https://github.com/google/re2/wiki/Syntax.
+	SandcrawlerRemoveCharRegex = regexp.MustCompile("[\\s\\p{P}\\p{M}\u2000-\u206F\u2E00-\u2E7F’\u0060·“”‘’“”«»「」¿–±§_°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]")
 )
 
 // TitleDoc is a document with a title.
@@ -342,3 +393,23 @@ func MapperPartial(p []byte) (fields [][]byte, err error) {
 	// TODO: Group by some normlized container name or identifier.
 	return nil, nil
 }
+
+// sandcrawlerSlugify lowercases s, strips known title prefixes, maps special
+// characters, applies NFKD and removes SandcrawlerRemoveCharRegex matches.
+func sandcrawlerSlugify(s string) string {
+	slug := strings.ToLower(strings.TrimSpace(s))
+	for _, prefix := range SandcrawlerPrefixRemove {
+		// Remove the prefix itself and keep the remainder of the title.
+		slug = strings.TrimPrefix(slug, prefix)
+	}
+	slug = strings.ReplaceAll(slug, "&apos;", "'")
+	for k, v := range SandcrawlerCharMap {
+		slug = strings.ReplaceAll(slug, k, v)
+	}
+	if len(slug) == 0 {
+		return slug
+	}
+	slug = norm.NFKD.String(slug)
+	slug = SandcrawlerRemoveCharRegex.ReplaceAllString(slug, "")
+	return strings.ToLower(slug)
+}