aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-07-17 16:02:26 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-07-17 16:02:26 +0200
commit03a2119d0b6f0a7a76194c1aaa074b218bc90a2f (patch)
tree2d215467d958a46b1460be40723e3659e17d0c17
parent279238e1e55abd37c5f8605e9494d2810ff1b34c (diff)
downloadrefcat-03a2119d0b6f0a7a76194c1aaa074b218bc90a2f.tar.gz
refcat-03a2119d0b6f0a7a76194c1aaa074b218bc90a2f.zip
cleanup another script
-rw-r--r--skate/Makefile2
-rw-r--r--skate/cluster.go155
-rw-r--r--skate/cluster_test.go43
-rw-r--r--skate/cmd/skate-cluster/main.go112
-rw-r--r--skate/map.go71
5 files changed, 72 insertions, 311 deletions
diff --git a/skate/Makefile b/skate/Makefile
index 35cf2a9..8092cbe 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -1,5 +1,5 @@
SHELL := /bin/bash
-TARGETS := skate-conv skate-cluster skate-cleanup skate-from-unstructured skate-wikipedia-doi skate-dot skate-map skate-reduce skate-cdx-lookup skate-resolve-journal-name
+TARGETS := skate-conv skate-cleanup skate-from-unstructured skate-wikipedia-doi skate-dot skate-map skate-reduce skate-cdx-lookup skate-resolve-journal-name
PKGNAME := skate
.PHONY: test
diff --git a/skate/cluster.go b/skate/cluster.go
deleted file mode 100644
index 9e16b78..0000000
--- a/skate/cluster.go
+++ /dev/null
@@ -1,155 +0,0 @@
-package skate
-
-import (
- "fmt"
- "regexp"
- "strings"
-
- "github.com/segmentio/encoding/json"
- "golang.org/x/text/unicode/norm"
-)
-
-// IdentifierKeyFunc returns the id and some key from a given blob.
-type IdentifierKeyFunc func([]byte) (string, string, error)
-
-var (
- wsReplacer = strings.NewReplacer("\t", " ", "\n", " ")
- repeatedWs = regexp.MustCompile(`[ ]{2,}`)
- nonWord = regexp.MustCompile(`[\W]+`)
-
- SandcrawlerCharMap = map[string]string{
- "\u00c6": "AE",
- "\u00e6": "ae",
- "\u00d0": "D",
- "\u00f0": "d",
- "\u00d8": "O",
- "\u00f8": "o",
- "\u00de": "Th",
- "\u00fe": "th",
- "\u00df": "s",
- "\u0110": "D",
- "\u0111": "d",
- "\u0126": "H",
- "\u0127": "h",
- "\u0131": "i",
- "\u0138": "k",
- "\u0141": "L",
- "\u0142": "l",
- "\u014a": "N",
- "\u014b": "n",
- "\u0152": "Oe",
- "\u0153": "oe",
- "\u0166": "T",
- "\u0167": "t",
- "\u00b5": "u",
- "c": "c",
- "\u0192": "f",
- "\u2202": "",
- "\u0296": "",
- "\u2211": "",
- "\u220f": "",
- "\u02c6": "",
- "\u2603": "",
- "\u02c7": "",
- }
- SandcrawlerPrefixRemove = []string{
- "original article: ", "original article ", "article: ", "title: ",
- }
- // SandcrawlerPrefixRemove does not have:
- // InCombiningDiacriticalMarks (assume it's in "M"),
- // https://unicodebook.readthedocs.io/unicode.html,
- // https://stackoverflow.com/q/5697171/89391,
- // https://github.com/google/re2/wiki/Syntax.
- SandcrawlerRemoveCharRegex = regexp.MustCompile("[\\s\\p{P}\\p{M}\u2000-\u206F\u2E00-\u2E7F’\u0060·“”‘’“”«»「」¿–±§_°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]")
-)
-
-// IdentTitleDoc is a minimal subset of fields, we can work with.
-type IdentTitleDoc struct {
- Ident string `json:"ident"`
- Title string `json:"title"`
-}
-
-// KeyTitle is extract the title, and slight cleaning.
-func KeyTitle(p []byte) (ident string, key string, err error) {
- var doc IdentTitleDoc
- if err = json.Unmarshal(p, &doc); err != nil {
- return ident, key, err
- }
- title := wsReplacer.Replace(strings.TrimSpace(doc.Title))
- return doc.Ident, title, nil
-}
-
-// KeyTitleNormalized applies further normalization.
-func KeyTitleNormalized(p []byte) (ident string, key string, err error) {
- ident, key, err = KeyTitle(p)
- if err != nil {
- return
- }
- key = strings.ToLower(key)
- key = repeatedWs.ReplaceAllString(key, " ")
- key = nonWord.ReplaceAllString(key, "")
- return ident, key, nil
-}
-
-// KeyTitleNysiis returns the New York State Identification and Intelligence
-// System phonetic code for the title.
-func KeyTitleNysiis(p []byte) (ident string, key string, err error) {
- ident, key, err = KeyTitle(p)
- if err != nil {
- return
- }
- return ident, NYSIIS(key), nil
-}
-
-// KeyTitleSandcrawler applies more sophisticated title cleanup.
-func KeyTitleSandcrawler(p []byte) (ident string, key string, err error) {
- ident, key, err = KeyTitle(p)
- if err != nil {
- return
- }
- return ident, sandcrawlerSlugify(key), nil
-}
-
-// CreateFixedFieldFunc creates an extractor function given a json path.
-// Currently only top level key is supported.
-func CreateFixedFieldFunc(path string) IdentifierKeyFunc {
- f := func(p []byte) (ident string, key string, err error) {
- var doc map[string]interface{}
- if err = json.Unmarshal(p, &doc); err != nil {
- return
- }
- v, ok := doc[path]
- if !ok {
- return "", "", nil
- }
- switch t := v.(type) {
- case string:
- return "", t, nil
- case int, int64, float32, float64:
- return "", fmt.Sprintf("%v", t), nil
- default:
- return "", "", nil
- }
- }
- return f
-}
-
-// sandcrawlerSlugify normalizes a string.
-func sandcrawlerSlugify(s string) string {
- slug := strings.ToLower(strings.TrimSpace(s))
- for _, prefix := range SandcrawlerPrefixRemove {
- if strings.HasPrefix(slug, prefix) {
- slug = slug[:len(prefix)]
- }
- }
- slug = strings.ReplaceAll(slug, "&apos;", "'")
- for k, v := range SandcrawlerCharMap {
- slug = strings.ReplaceAll(slug, k, v)
- }
- if len(slug) == 0 {
- return slug
- }
- slug = norm.NFKD.String(slug)
- slug = SandcrawlerRemoveCharRegex.ReplaceAllString(slug, "")
- return strings.ToLower(slug)
-}
diff --git a/skate/cluster_test.go b/skate/cluster_test.go
deleted file mode 100644
index 1c8819e..0000000
--- a/skate/cluster_test.go
+++ /dev/null
@@ -1,43 +0,0 @@
-package skate
-
-import "testing"
-
-func TestKeyTitleSandcrawler(t *testing.T) {
- var cases = []struct {
- b []byte
- ident string
- key string
- err error
- }{
- {
- []byte(`{"ident": "123", "title": "abc"}`),
- "123",
- "abc",
- nil,
- },
- {
- []byte(`{"ident": "123", "title": "abc++***##???ßßß"}`),
- "123",
- "abcsss",
- nil,
- },
- {
- []byte(`{"ident": "123", "title": "A k"}`),
- "123",
- "ak",
- nil,
- },
- }
- for _, c := range cases {
- ident, key, err := KeyTitleSandcrawler(c.b)
- if key != c.key {
- t.Errorf("[key] got %v, want %v", key, c.key)
- }
- if ident != c.ident {
- t.Errorf("[ident] got %v, want %v", ident, c.ident)
- }
- if err != c.err {
- t.Errorf("[err] got %v, want %v", err, c.err)
- }
- }
-}
diff --git a/skate/cmd/skate-cluster/main.go b/skate/cmd/skate-cluster/main.go
deleted file mode 100644
index de11de1..0000000
--- a/skate/cmd/skate-cluster/main.go
+++ /dev/null
@@ -1,112 +0,0 @@
-// skate-cluster takes the (tab) output of skate-map (plus sort) and generates
-// a "cluster" document, grouping docs by key. Can do some pre-filtering (e.g.
-// require refs and release docs in a single cluster).
-//
-// For example, this:
-//
-// id123 somekey123 {"a":"b", ...}
-// id391 somekey123 {"x":"y", ...}
-//
-// would turn into (a single line containing all docs with the same key).
-//
-// {"k": "somekey123", "v": [{"a":"b", ...},{"x":"y",...}]}
-//
-// A single line cluster is easier to parallelize (e.g. for verification, etc.).
-package main
-
-import (
- "bufio"
- "flag"
- "fmt"
- "io"
- "log"
- "os"
- "strings"
-)
-
-var (
- keyField = flag.Int("k", 2, "which column contains the key (one based)")
- docField = flag.Int("d", 3, "which column contains the doc (one based)")
- minClusterSize = flag.Int("min", 2, "minimum cluster size")
- maxClusterSize = flag.Int("max", 100000, "maximum cluster size")
- requireBoth = flag.Bool("both", false,
- "require at least one ref and one non-ref item present in the cluster, implies -min 2")
- dropEmptyKeys = flag.Bool("D", false, "drop empty keys")
- delimiter = flag.String("d", "\t", "field delimiter")
-)
-
-func main() {
- flag.Parse()
- var (
- br = bufio.NewReader(os.Stdin)
- bw = bufio.NewWriter(os.Stdout)
- prev, key, doc string
- batch, fields []string
- keyIndex = *keyField - 1
- docIndex = *docField - 1
- line string
- err error
- )
- defer bw.Flush()
- for {
- line, err = br.ReadString('\n')
- if err == io.EOF {
- break
- }
- if err != nil {
- log.Fatal(err)
- }
- fields = strings.Split(line, *delimiter)
- if len(fields) <= keyIndex || len(fields) <= docIndex {
- log.Fatalf("line has only %d fields", len(fields))
- }
- key = strings.TrimSpace(fields[keyIndex])
- if *dropEmptyKeys && len(key) == 0 {
- continue
- }
- doc = strings.TrimSpace(fields[docIndex])
- if prev != key {
- if err := writeBatch(bw, key, batch); err != nil {
- log.Fatal(err)
- }
- batch = nil
- }
- prev = key
- batch = append(batch, doc)
- }
- if err := writeBatch(bw, prev, batch); err != nil {
- log.Fatal(err)
- }
-}
-
-// containsBoth return true, if we have a ref and a non-ref item in the batch.
-func containsBoth(batch []string) bool {
- var numRef int
- for _, doc := range batch {
- // This is brittle (but faster). Most JSON should be in compact form,
- // and there the following chars are by convention added to distinguish
- // a release coming from a reference doc from other releases.
- if strings.Contains(doc, `"status":"ref"`) {
- numRef++
- }
- }
- return numRef > 0 && numRef < len(batch)
-}
-
-// writeBatch writes out a single line containing the key and the cluster values.
-func writeBatch(w io.Writer, key string, batch []string) (err error) {
- if len(batch) == 0 {
- return nil
- }
- if len(batch) < *minClusterSize || len(batch) > *maxClusterSize {
- return nil
- }
- if *requireBoth && !containsBoth(batch) {
- return nil
- }
- // This is brittle (and fast), but all items in a batch are valid JSON
- // objects, hence, the following will be valid JSON as well, or will it?
- // The key should not contain a quote.
- _, err = fmt.Fprintf(w, "{\"k\": \"%s\", \"v\": [%s]}\n", key, strings.Join(batch, ","))
- return
-}
diff --git a/skate/map.go b/skate/map.go
index ef9c018..53aed1e 100644
--- a/skate/map.go
+++ b/skate/map.go
@@ -4,11 +4,13 @@ import (
"bytes"
"errors"
"reflect"
+ "regexp"
"runtime"
"strings"
"github.com/segmentio/encoding/json"
"github.com/tidwall/gjson"
+ "golang.org/x/text/unicode/norm"
)
var (
@@ -17,6 +19,55 @@ var (
ErrZeroFields = errors.New("zero fields")
ErrMissingFieldName = errors.New("missing field name")
+
+ wsReplacer = strings.NewReplacer("\t", " ", "\n", " ")
+ repeatedWs = regexp.MustCompile(`[ ]{2,}`)
+ nonWord = regexp.MustCompile(`[\W]+`)
+
+ SandcrawlerCharMap = map[string]string{
+ "\u00c6": "AE",
+ "\u00e6": "ae",
+ "\u00d0": "D",
+ "\u00f0": "d",
+ "\u00d8": "O",
+ "\u00f8": "o",
+ "\u00de": "Th",
+ "\u00fe": "th",
+ "\u00df": "s",
+ "\u0110": "D",
+ "\u0111": "d",
+ "\u0126": "H",
+ "\u0127": "h",
+ "\u0131": "i",
+ "\u0138": "k",
+ "\u0141": "L",
+ "\u0142": "l",
+ "\u014a": "N",
+ "\u014b": "n",
+ "\u0152": "Oe",
+ "\u0153": "oe",
+ "\u0166": "T",
+ "\u0167": "t",
+ "\u00b5": "u",
+ "c": "c",
+ "\u0192": "f",
+ "\u2202": "",
+ "\u0296": "",
+ "\u2211": "",
+ "\u220f": "",
+ "\u02c6": "",
+ "\u2603": "",
+ "\u02c7": "",
+ }
+ SandcrawlerPrefixRemove = []string{
+ "original article: ", "original article ", "article: ", "title: ",
+ }
+ // SandcrawlerRemoveCharRegex does not have:
+ // InCombiningDiacriticalMarks (assume it's in "M"),
+ // https://unicodebook.readthedocs.io/unicode.html,
+ // https://stackoverflow.com/q/5697171/89391,
+ // https://github.com/google/re2/wiki/Syntax.
+ SandcrawlerRemoveCharRegex = regexp.MustCompile("[\\s\\p{P}\\p{M}\u2000-\u206F\u2E00-\u2E7F’\u0060·“”‘’“”«»「」¿–±§_°ʖ©®¤=<>|+$^~≈√∫≤≥÷ƒ∆¬£¢∞¥◊€]")
)
// TitleDoc is a document with a title.
@@ -342,3 +393,23 @@ func MapperPartial(p []byte) (fields [][]byte, err error) {
// TODO: Group by some normlized container name or identifier.
return nil, nil
}
+
+// sandcrawlerSlugify normalizes a string.
+func sandcrawlerSlugify(s string) string {
+ slug := strings.ToLower(strings.TrimSpace(s))
+ for _, prefix := range SandcrawlerPrefixRemove {
+ if strings.HasPrefix(slug, prefix) {
+ slug = slug[len(prefix):]
+ }
+ }
+ slug = strings.ReplaceAll(slug, "&apos;", "'")
+ for k, v := range SandcrawlerCharMap {
+ slug = strings.ReplaceAll(slug, k, v)
+ }
+ if len(slug) == 0 {
+ return slug
+ }
+ slug = norm.NFKD.String(slug)
+ slug = SandcrawlerRemoveCharRegex.ReplaceAllString(slug, "")
+ return strings.ToLower(slug)
+}