diff options
-rw-r--r-- | skate/cluster.go | 25 | ||||
-rw-r--r-- | skate/cmd/skate-derive-key/main.go | 22 |
2 files changed, 27 insertions, 20 deletions
diff --git a/skate/cluster.go b/skate/cluster.go index 7fc4e1b..3421a0b 100644 --- a/skate/cluster.go +++ b/skate/cluster.go @@ -1,6 +1,7 @@ package skate import ( + "fmt" "regexp" "strings" @@ -109,6 +110,30 @@ func KeyTitleSandcrawler(p []byte) (ident string, key string, err error) { return ident, sandcrawlerSlugify(key), nil } +// CreateFixedFieldFunc creates an extractor function given a json path. +// Currently only top level key is supported. +func CreateFixedFieldFunc(path string) IdentifierKeyFunc { + f := func(p []byte) (ident string, key string, err error) { + var doc map[string]interface{} + if err = json.Unmarshal(p, &doc); err != nil { + return + } + v, ok := doc[path] + if !ok { + return "", "", nil + } + switch t := v.(type) { + case string: + return "", t, nil + case int, int64, float32, float64: + return "", fmt.Sprintf("%v", t), nil + default: + return "", "", nil + } + } + return f +} + // sandcrawlerSlugify normalizes a string. func sandcrawlerSlugify(s string) string { slug := strings.ToLower(strings.TrimSpace(s)) diff --git a/skate/cmd/skate-derive-key/main.go b/skate/cmd/skate-derive-key/main.go index 61c79e1..05e1261 100644 --- a/skate/cmd/skate-derive-key/main.go +++ b/skate/cmd/skate-derive-key/main.go @@ -29,7 +29,6 @@ import ( "git.archive.org/martin/cgraph/skate" "git.archive.org/martin/cgraph/skate/parallel" - json "github.com/segmentio/encoding/json" ) var ( @@ -40,7 +39,7 @@ var ( verbose = flag.Bool("verbose", false, "show progress") bestEffort = flag.Bool("B", false, "best effort") logFile = flag.String("log", "", "log filename") - skipEmptyKeys = flag.Bool("skip-empty-keys", false, "omit docs without keys") + skipEmptyKeys = flag.Bool("skip-empty-keys", false, "omit docs with empty keys") wsReplacer = strings.NewReplacer("\t", "", "\n", "") keyOpts = map[string]skate.IdentifierKeyFunc{ @@ -58,24 +57,7 @@ func main() { if *fixedField != "" { // We want this, because from biblioref we wanted source_release_ident, // from refs release_ident, etc. - keyFunc = func(p []byte) (id string, key string, err error) { - var doc map[string]interface{} - if err = json.Unmarshal(p, &doc); err != nil { - return - } - v, ok := doc[*fixedField] - if !ok { - return "", "", nil - } - switch t := v.(type) { - case string: - return "", t, nil - case int, int64, float32, float64: - return "", fmt.Sprintf("%v", t), nil - default: - return "", "", nil - } - } + keyFunc = skate.CreateFixedFieldFunc(*fixedField) } else { if keyFunc, ok = keyOpts[*keyFuncName]; !ok { log.Fatal("invalid key func") |