diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-26 22:31:05 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-26 22:31:05 +0200 |
commit | 57826605209de687e0b6e6cb151021b7bcf034ca (patch) | |
tree | 1a3ec4a5891fdc6024438a910d064b44fb1eafbf /skate/cmd | |
parent | 5e082f6c0154887011c47f03c80c48352e8cdc77 (diff) | |
download | refcat-57826605209de687e0b6e6cb151021b7bcf034ca.tar.gz refcat-57826605209de687e0b6e6cb151021b7bcf034ca.zip |
cleanup obsolete command
Diffstat (limited to 'skate/cmd')
-rw-r--r-- | skate/cmd/skate-derive-key/main.go | 96 |
1 files changed, 0 insertions, 96 deletions
diff --git a/skate/cmd/skate-derive-key/main.go b/skate/cmd/skate-derive-key/main.go deleted file mode 100644 index 05e1261..0000000 --- a/skate/cmd/skate-derive-key/main.go +++ /dev/null @@ -1,96 +0,0 @@ -// skate-derive-key derives a key from release entity JSON documents. -// -// $ skate-derive-key < release_entities.jsonlines > docs.tsv -// -// Result will be a three column TSV (ident, key, doc), LC_ALL=C sorted by key. -// -// ---- ident --------------- ---- key ------------------------------ ---- doc ---------- -// -// 4lzgf5wzljcptlebhyobccj7ru 2568diamagneticsusceptibilityofh8n2o10sr {"abstracts":[],... -// -// After this step, a fast "itertools.groupby" or "skate-cluster" on key can yields clusters. -// -// Notes -// -// Using https://github.com/DataDog/zstd#stream-api, 3700 docs/s for key -// extraction only; using zstd -T0, we get 21K docs/s; about 13K docs/s; about -// 32h for 1.5B records. -// -// Default sort(1) buffer is 1K, but we'll need G's, e.g. -S35% of 48GB. -package main - -import ( - "flag" - "fmt" - "log" - "os" - "runtime" - "strings" - - "git.archive.org/martin/cgraph/skate" - "git.archive.org/martin/cgraph/skate/parallel" -) - -var ( - keyFuncName = flag.String("f", "tsand", "key function name, other: title, tnorm, tnysi, tsand") - fixedField = flag.String("F", "", "extract string value from a fixed top level field, e.g. source_release_ident, ...") - numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") - batchSize = flag.Int("b", 50000, "batch size") - verbose = flag.Bool("verbose", false, "show progress") - bestEffort = flag.Bool("B", false, "best effort") - logFile = flag.String("log", "", "log filename") - skipEmptyKeys = flag.Bool("skip-empty-keys", false, "omit docs with empty keys") - - wsReplacer = strings.NewReplacer("\t", "", "\n", "") - keyOpts = map[string]skate.IdentifierKeyFunc{ - "title": skate.KeyTitle, - "tnorm": skate.KeyTitleNormalized, - "tnysi": skate.KeyTitleNysiis, - "tsand": skate.KeyTitleSandcrawler, - } - keyFunc skate.IdentifierKeyFunc - ok bool -) - -func main() { - flag.Parse() - if *fixedField != "" { - // We want this, because from biblioref we wanted source_release_ident, - // from refs release_ident, etc. - keyFunc = skate.CreateFixedFieldFunc(*fixedField) - } else { - if keyFunc, ok = keyOpts[*keyFuncName]; !ok { - log.Fatal("invalid key func") - } - } - if *logFile != "" { - f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644) - if err != nil { - log.Fatal(err) - } - defer f.Close() - log.SetOutput(f) - } - pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) { - ident, key, err := keyFunc(p) - if err != nil { - if *bestEffort { - log.Printf("keyFunc failed with %v: %v", err, string(p)) - return nil, nil - } - return nil, err - } - ident, key = strings.TrimSpace(ident), strings.TrimSpace(key) - if *skipEmptyKeys && key == "" { - return nil, nil - } - v := fmt.Sprintf("%s\t%s\t%s\n", ident, key, wsReplacer.Replace(string(p))) - return []byte(v), nil - }) - pp.NumWorkers = *numWorkers - pp.BatchSize = *batchSize - pp.Verbose = *verbose - if err := pp.Run(); err != nil { - log.Fatal(err) - } -} |