diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-07-17 16:02:26 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-07-17 16:02:26 +0200 |
commit | 03a2119d0b6f0a7a76194c1aaa074b218bc90a2f (patch) | |
tree | 2d215467d958a46b1460be40723e3659e17d0c17 | |
parent | 279238e1e55abd37c5f8605e9494d2810ff1b34c (diff) | |
download | refcat-03a2119d0b6f0a7a76194c1aaa074b218bc90a2f.tar.gz refcat-03a2119d0b6f0a7a76194c1aaa074b218bc90a2f.zip |
cleanup another script
-rw-r--r-- | skate/Makefile | 2 | ||||
-rw-r--r-- | skate/cluster.go | 155 | ||||
-rw-r--r-- | skate/cluster_test.go | 43 | ||||
-rw-r--r-- | skate/cmd/skate-cluster/main.go | 112 | ||||
-rw-r--r-- | skate/map.go | 71 |
5 files changed, 72 insertions, 311 deletions
diff --git a/skate/Makefile b/skate/Makefile index 35cf2a9..8092cbe 100644 --- a/skate/Makefile +++ b/skate/Makefile @@ -1,5 +1,5 @@ SHELL := /bin/bash -TARGETS := skate-conv skate-cluster skate-cleanup skate-from-unstructured skate-wikipedia-doi skate-dot skate-map skate-reduce skate-cdx-lookup skate-resolve-journal-name +TARGETS := skate-conv skate-cleanup skate-from-unstructured skate-wikipedia-doi skate-dot skate-map skate-reduce skate-cdx-lookup skate-resolve-journal-name PKGNAME := skate .PHONY: test diff --git a/skate/cluster.go b/skate/cluster.go deleted file mode 100644 index 9e16b78..0000000 --- a/skate/cluster.go +++ /dev/null @@ -1,155 +0,0 @@ -package skate - -import ( - "fmt" - "regexp" - "strings" - - "github.com/segmentio/encoding/json" - "golang.org/x/text/unicode/norm" -) - -// IdentifierKeyFunc returns the id and some key from a given blob. -type IdentifierKeyFunc func([]byte) (string, string, error) - -var ( - wsReplacer = strings.NewReplacer("\t", " ", "\n", " ") - repeatedWs = regexp.MustCompile(`[ ]{2,}`) - nonWord = regexp.MustCompile(`[\W]+`) - - SandcrawlerCharMap = map[string]string{ - "\u00c6": "AE", - "\u00e6": "ae", - "\u00d0": "D", - "\u00f0": "d", - "\u00d8": "O", - "\u00f8": "o", - "\u00de": "Th", - "\u00fe": "th", - "\u00df": "s", - "\u0110": "D", - "\u0111": "d", - "\u0126": "H", - "\u0127": "h", - "\u0131": "i", - "\u0138": "k", - "\u0141": "L", - "\u0142": "l", - "\u014a": "N", - "\u014b": "n", - "\u0152": "Oe", - "\u0153": "oe", - "\u0166": "T", - "\u0167": "t", - "\u00b5": "u", - "c": "c", - "\u0192": "f", - "\u2202": "", - "\u0296": "", - "\u2211": "", - "\u220f": "", - "\u02c6": "", - "\u2603": "", - "\u02c7": "", - } - SandcrawlerPrefixRemove = []string{ - "original article: ", "original article ", "article: ", "title: ", - } - // SandcrawlerPrefixRemove does not have: - // InCombiningDiacriticalMarks (assume it's in "M"), - // https://unicodebook.readthedocs.io/unicode.html, - // https://stackoverflow.com/q/5697171/89391, - // https://github.com/google/re2/wiki/Syntax. - SandcrawlerRemoveCharRegex = regexp.MustCompile("[\\s\\p{P}\\p{M}\u2000-\u206F\u2E00-\u2E7F’\u0060·“”‘’“”«»「」¿–±§_°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]") -) - -// IdentTitleDoc is a minimal subset of fields, we can work with. -type IdentTitleDoc struct { - Ident string `json:"ident"` - Title string `json:"title"` -} - -// KeyTitle is extract the title, and slight cleaning. -func KeyTitle(p []byte) (ident string, key string, err error) { - var doc IdentTitleDoc - if err = json.Unmarshal(p, &doc); err != nil { - return ident, key, err - } - title := wsReplacer.Replace(strings.TrimSpace(doc.Title)) - return doc.Ident, title, nil -} - -// KeyTitleNormalized applies further normalization. -func KeyTitleNormalized(p []byte) (ident string, key string, err error) { - ident, key, err = KeyTitle(p) - if err != nil { - return - } - key = strings.ToLower(key) - key = repeatedWs.ReplaceAllString(key, " ") - key = nonWord.ReplaceAllString(key, "") - return ident, key, nil -} - -// KeyTitleNysiis returns the New York State Identification and Intelligence -// System phonetic code for the title. -func KeyTitleNysiis(p []byte) (ident string, key string, err error) { - ident, key, err = KeyTitle(p) - if err != nil { - return - } - return ident, NYSIIS(key), nil -} - -// KeyTitleSandcrawler applies more sophisticated title cleanup. -func KeyTitleSandcrawler(p []byte) (ident string, key string, err error) { - ident, key, err = KeyTitle(p) - if err != nil { - return - } - return ident, sandcrawlerSlugify(key), nil -} - -// CreateFixedFieldFunc creates an extractor function given a json path. -// Currently only top level key is supported. -func CreateFixedFieldFunc(path string) IdentifierKeyFunc { - f := func(p []byte) (ident string, key string, err error) { - var doc map[string]interface{} - if err = json.Unmarshal(p, &doc); err != nil { - return - } - v, ok := doc[path] - if !ok { - return "", "", nil - } - switch t := v.(type) { - case string: - return "", t, nil - case int, int64, float32, float64: - return "", fmt.Sprintf("%v", t), nil - default: - return "", "", nil - } - } - return f -} - -// sandcrawlerSlugify normalizes a string. -func sandcrawlerSlugify(s string) string { - slug := strings.ToLower(strings.TrimSpace(s)) - for _, prefix := range SandcrawlerPrefixRemove { - if strings.HasPrefix(slug, prefix) { - slug = slug[:len(prefix)] - } - } - slug = strings.ReplaceAll(slug, "'", "'") - for k, v := range SandcrawlerCharMap { - slug = strings.ReplaceAll(slug, k, v) - } - if len(slug) == 0 { - return slug - } - slug = norm.NFKD.String(slug) - slug = SandcrawlerRemoveCharRegex.ReplaceAllString(slug, "") - return strings.ToLower(slug) -} diff --git a/skate/cluster_test.go b/skate/cluster_test.go deleted file mode 100644 index 1c8819e..0000000 --- a/skate/cluster_test.go +++ /dev/null @@ -1,43 +0,0 @@ -package skate - -import "testing" - -func TestKeyTitleSandcrawler(t *testing.T) { - var cases = []struct { - b []byte - ident string - key string - err error - }{ - { - []byte(`{"ident": "123", "title": "abc"}`), - "123", - "abc", - nil, - }, - { - []byte(`{"ident": "123", "title": "abc++***##???ßßß"}`), - "123", - "abcsss", - nil, - }, - { - []byte(`{"ident": "123", "title": "A k"}`), - "123", - "ak", - nil, - }, - } - for _, c := range cases { - ident, key, err := KeyTitleSandcrawler(c.b) - if key != c.key { - t.Errorf("[key] got %v, want %v", key, c.key) - } - if ident != c.ident { - t.Errorf("[ident] got %v, want %v", ident, c.ident) - } - if err != c.err { - t.Errorf("[err] got %v, want %v", err, c.err) - } - } -} diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go deleted file mode 100644 index de11de1..0000000 --- a/skate/cmd/skate-cluster/main.go +++ /dev/null @@ -1,112 +0,0 @@ -// skate-cluster takes the (tab) output of skate-map (plus sort) and generates -// a "cluster" document, grouping docs by key. Can do some pre-filtering (e.g. -// require refs and release docs in a single cluster). -// -// For example, this: -// -// id123 somekey123 {"a":"b", ...} -// id391 somekey123 {"x":"y", ...} -// -// would turn into (a single line containing all docs with the same key). -// -// {"k": "somekey123", "v": [{"a":"b", ...},{"x":"y",...}]} -// -// A single line cluster is easier to parallelize (e.g. for verification, etc.). -package main - -import ( - "bufio" - "flag" - "fmt" - "io" - "log" - "os" - "strings" -) - -var ( - keyField = flag.Int("k", 2, "which column contains the key (one based)") - docField = flag.Int("d", 3, "which column contains the doc (one based)") - minClusterSize = flag.Int("min", 2, "minimum cluster size") - maxClusterSize = flag.Int("max", 100000, "maximum cluster size") - requireBoth = flag.Bool("both", false, - "require at least one ref and one non-ref item present in the cluster, implies -min 2") - dropEmptyKeys = flag.Bool("D", false, "drop empty keys") - delimiter = flag.String("d", "\t", "field delimiter") -) - -func main() { - flag.Parse() - var ( - br = bufio.NewReader(os.Stdin) - bw = bufio.NewWriter(os.Stdout) - prev, key, doc string - batch, fields []string - keyIndex = *keyField - 1 - docIndex = *docField - 1 - line string - err error - ) - defer bw.Flush() - for { - line, err = br.ReadString('\n') - if err == io.EOF { - break - } - if err != nil { - log.Fatal(err) - } - fields = strings.Split(line, *delimiter) - if len(fields) <= keyIndex || len(fields) <= docIndex { - log.Fatalf("line has only %d fields", len(fields)) - } - key = strings.TrimSpace(fields[keyIndex]) - if *dropEmptyKeys && len(key) == 0 { - continue - } - doc = strings.TrimSpace(fields[docIndex]) - if prev != key { - if err := writeBatch(bw, key, batch); err != nil { - log.Fatal(err) - } - batch = nil - } - prev = key - batch = append(batch, doc) - } - if err := writeBatch(bw, prev, batch); err != nil { - log.Fatal(err) - } -} - -// containsBoth return true, if we have a ref and a non-ref item in the batch. -func containsBoth(batch []string) bool { - var numRef int - for _, doc := range batch { - // This is brittle (but faster). Most JSON should be in compact form, - // and there the following chars are by convention added to distinguish - // a release coming from a reference doc from other releases. - if strings.Contains(doc, `"status":"ref"`) { - numRef++ - } - } - return numRef > 0 && numRef < len(batch) -} - -// writeBatch writes out a single line containing the key and the cluster values. -func writeBatch(w io.Writer, key string, batch []string) (err error) { - if len(batch) == 0 { - return nil - } - if len(batch) < *minClusterSize || len(batch) > *maxClusterSize { - return nil - } - if *requireBoth && !containsBoth(batch) { - return nil - } - // This is brittle (and fast), but all items in a batch are valid JSON - // objects, hence, the following will be valid JSON as well, or will it? - // The key should not contain a quote. - _, err = fmt.Fprintf(w, "{\"k\": \"%s\", \"v\": [%s]}\n", key, strings.Join(batch, ",")) - return -} diff --git a/skate/map.go b/skate/map.go index ef9c018..53aed1e 100644 --- a/skate/map.go +++ b/skate/map.go @@ -4,11 +4,13 @@ import ( "bytes" "errors" "reflect" + "regexp" "runtime" "strings" "github.com/segmentio/encoding/json" "github.com/tidwall/gjson" + "golang.org/x/text/unicode/norm" ) var ( @@ -17,6 +19,55 @@ var ( ErrZeroFields = errors.New("zero fields") ErrMissingFieldName = errors.New("missing field name") + + wsReplacer = strings.NewReplacer("\t", " ", "\n", " ") + repeatedWs = regexp.MustCompile(`[ ]{2,}`) + nonWord = regexp.MustCompile(`[\W]+`) + + SandcrawlerCharMap = map[string]string{ + "\u00c6": "AE", + "\u00e6": "ae", + "\u00d0": "D", + "\u00f0": "d", + "\u00d8": "O", + "\u00f8": "o", + "\u00de": "Th", + "\u00fe": "th", + "\u00df": "s", + "\u0110": "D", + "\u0111": "d", + "\u0126": "H", + "\u0127": "h", + "\u0131": "i", + "\u0138": "k", + "\u0141": "L", + "\u0142": "l", + "\u014a": "N", + "\u014b": "n", + "\u0152": "Oe", + "\u0153": "oe", + "\u0166": "T", + "\u0167": "t", + "\u00b5": "u", + "c": "c", + "\u0192": "f", + "\u2202": "", + "\u0296": "", + "\u2211": "", + "\u220f": "", + "\u02c6": "", + "\u2603": "", + "\u02c7": "", + } + SandcrawlerPrefixRemove = []string{ + "original article: ", "original article ", "article: ", "title: ", + } + // SandcrawlerPrefixRemove does not have: + // InCombiningDiacriticalMarks (assume it's in "M"), + // https://unicodebook.readthedocs.io/unicode.html, + // https://stackoverflow.com/q/5697171/89391, + // https://github.com/google/re2/wiki/Syntax. + SandcrawlerRemoveCharRegex = regexp.MustCompile("[\\s\\p{P}\\p{M}\u2000-\u206F\u2E00-\u2E7F’\u0060·“”‘’“”«»「」¿–±§_°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]") ) // TitleDoc is a document with a title. @@ -342,3 +393,23 @@ func MapperPartial(p []byte) (fields [][]byte, err error) { // TODO: Group by some normlized container name or identifier. return nil, nil } + +// sandcrawlerSlugify normalizes a string. +func sandcrawlerSlugify(s string) string { + slug := strings.ToLower(strings.TrimSpace(s)) + for _, prefix := range SandcrawlerPrefixRemove { + if strings.HasPrefix(slug, prefix) { + slug = slug[:len(prefix)] + } + } + slug = strings.ReplaceAll(slug, "'", "'") + for k, v := range SandcrawlerCharMap { + slug = strings.ReplaceAll(slug, k, v) + } + if len(slug) == 0 { + return slug + } + slug = norm.NFKD.String(slug) + slug = SandcrawlerRemoveCharRegex.ReplaceAllString(slug, "") + return strings.ToLower(slug) +} |