From 09a7e8c9d013f13a1aa1ef4e9b7f397647b79967 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Sun, 21 Mar 2021 01:17:38 +0100 Subject: initial import of skate --- skate/cmd/skate-derive-key/main.go | 92 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 skate/cmd/skate-derive-key/main.go (limited to 'skate/cmd/skate-derive-key') diff --git a/skate/cmd/skate-derive-key/main.go b/skate/cmd/skate-derive-key/main.go new file mode 100644 index 0000000..2375a73 --- /dev/null +++ b/skate/cmd/skate-derive-key/main.go @@ -0,0 +1,92 @@ +// skate-derive-key derives a key from release entity JSON documents. +// +// $ skate-derive-key < release_entities.jsonlines > docs.tsv +// +// Result will be a three column TSV (ident, key, doc), LC_ALL=C sorted by key. +// +// ---- ident --------------- ---- key ------------------------------ ---- doc ---------- +// +// 4lzgf5wzljcptlebhyobccj7ru 2568diamagneticsusceptibilityofh8n2o10sr {"abstracts":[],... +// +// After this step, a fast "itertools.groupby" or "skate-cluster" on key can yields clusters. +// +// Notes +// +// Using https://github.com/DataDog/zstd#stream-api, 3700 docs/s for key +// extraction only; using zstd -T0, we get 21K docs/s; about 13K docs/s; about +// 32h for 1.5B records. +// +// Default sort(1) buffer is 1K, but we'll need G's, e.g. -S35% of 48GB. +package main + +import ( + "flag" + "fmt" + "log" + "os" + "runtime" + "strings" + "time" + + "git.archive.org/martin/cgraph/skate" + "git.archive.org/martin/cgraph/skate/parallel" +) + +var ( + keyFuncName = flag.String("f", "tsand", "key function name, other: title, tnorm, tnysi, tsand") + numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") + batchSize = flag.Int("b", 50000, "batch size") + verbose = flag.Bool("verbose", false, "show progress") + bestEffort = flag.Bool("B", false, "best effort") + logFile = flag.String("log", "", "log filename") + skipEmptyKeys = flag.Bool("skip-empty-keys", false, "omit docs without keys") + + wsReplacer = strings.NewReplacer("\t", "", "\n", "") + keyOpts = map[string]skate.IdentifierKeyFunc{ + "title": skate.KeyTitle, + "tnorm": skate.KeyTitleNormalized, + "tnysi": skate.KeyTitleNysiis, + "tsand": skate.KeyTitleSandcrawler, + } + keyFunc skate.IdentifierKeyFunc + ok bool +) + +func main() { + flag.Parse() + if keyFunc, ok = keyOpts[*keyFuncName]; !ok { + log.Fatal("invalid key func") + } + if *logFile != "" { + f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644) + if err != nil { + log.Fatal(err) + } + defer f.Close() + log.SetOutput(f) + } + started := time.Now() + pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) { + ident, key, err := keyFunc(p) + if err != nil { + if *bestEffort { + log.Printf("keyFunc failed with %v: %v", err, string(p)) + return nil, nil + } + return nil, err + } + ident, key = strings.TrimSpace(ident), strings.TrimSpace(key) + if *skipEmptyKeys && key == "" { + return nil, nil + } + v := fmt.Sprintf("%s\t%s\t%s\n", ident, key, wsReplacer.Replace(string(p))) + return []byte(v), nil + }) + pp.NumWorkers = *numWorkers + pp.BatchSize = *batchSize + pp.Verbose = *verbose + if err := pp.Run(); err != nil { + log.Fatal(err) + } + log.Printf("took: %s", time.Since(started)) +} -- cgit v1.2.3