aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cmd
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-05-26 22:31:05 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-05-26 22:31:05 +0200
commit57826605209de687e0b6e6cb151021b7bcf034ca (patch)
tree1a3ec4a5891fdc6024438a910d064b44fb1eafbf /skate/cmd
parent5e082f6c0154887011c47f03c80c48352e8cdc77 (diff)
downloadrefcat-57826605209de687e0b6e6cb151021b7bcf034ca.tar.gz
refcat-57826605209de687e0b6e6cb151021b7bcf034ca.zip
cleanup obsolete command
Diffstat (limited to 'skate/cmd')
-rw-r--r--skate/cmd/skate-derive-key/main.go96
1 files changed, 0 insertions, 96 deletions
diff --git a/skate/cmd/skate-derive-key/main.go b/skate/cmd/skate-derive-key/main.go
deleted file mode 100644
index 05e1261..0000000
--- a/skate/cmd/skate-derive-key/main.go
+++ /dev/null
@@ -1,96 +0,0 @@
-// skate-derive-key derives a key from release entity JSON documents.
-//
-// $ skate-derive-key < release_entities.jsonlines > docs.tsv
-//
-// Result will be a three column TSV (ident, key, doc), LC_ALL=C sorted by key.
-//
-// ---- ident --------------- ---- key ------------------------------ ---- doc ----------
-//
-// 4lzgf5wzljcptlebhyobccj7ru 2568diamagneticsusceptibilityofh8n2o10sr {"abstracts":[],...
-//
-// After this step, a fast "itertools.groupby" or "skate-cluster" on key can yields clusters.
-//
-// Notes
-//
-// Using https://github.com/DataDog/zstd#stream-api, 3700 docs/s for key
-// extraction only; using zstd -T0, we get 21K docs/s; about 13K docs/s; about
-// 32h for 1.5B records.
-//
-// Default sort(1) buffer is 1K, but we'll need G's, e.g. -S35% of 48GB.
-package main
-
-import (
- "flag"
- "fmt"
- "log"
- "os"
- "runtime"
- "strings"
-
- "git.archive.org/martin/cgraph/skate"
- "git.archive.org/martin/cgraph/skate/parallel"
-)
-
-var (
- keyFuncName = flag.String("f", "tsand", "key function name, other: title, tnorm, tnysi, tsand")
- fixedField = flag.String("F", "", "extract string value from a fixed top level field, e.g. source_release_ident, ...")
- numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
- batchSize = flag.Int("b", 50000, "batch size")
- verbose = flag.Bool("verbose", false, "show progress")
- bestEffort = flag.Bool("B", false, "best effort")
- logFile = flag.String("log", "", "log filename")
- skipEmptyKeys = flag.Bool("skip-empty-keys", false, "omit docs with empty keys")
-
- wsReplacer = strings.NewReplacer("\t", "", "\n", "")
- keyOpts = map[string]skate.IdentifierKeyFunc{
- "title": skate.KeyTitle,
- "tnorm": skate.KeyTitleNormalized,
- "tnysi": skate.KeyTitleNysiis,
- "tsand": skate.KeyTitleSandcrawler,
- }
- keyFunc skate.IdentifierKeyFunc
- ok bool
-)
-
-func main() {
- flag.Parse()
- if *fixedField != "" {
- // We want this, because from biblioref we wanted source_release_ident,
- // from refs release_ident, etc.
- keyFunc = skate.CreateFixedFieldFunc(*fixedField)
- } else {
- if keyFunc, ok = keyOpts[*keyFuncName]; !ok {
- log.Fatal("invalid key func")
- }
- }
- if *logFile != "" {
- f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644)
- if err != nil {
- log.Fatal(err)
- }
- defer f.Close()
- log.SetOutput(f)
- }
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) {
- ident, key, err := keyFunc(p)
- if err != nil {
- if *bestEffort {
- log.Printf("keyFunc failed with %v: %v", err, string(p))
- return nil, nil
- }
- return nil, err
- }
- ident, key = strings.TrimSpace(ident), strings.TrimSpace(key)
- if *skipEmptyKeys && key == "" {
- return nil, nil
- }
- v := fmt.Sprintf("%s\t%s\t%s\n", ident, key, wsReplacer.Replace(string(p)))
- return []byte(v), nil
- })
- pp.NumWorkers = *numWorkers
- pp.BatchSize = *batchSize
- pp.Verbose = *verbose
- if err := pp.Run(); err != nil {
- log.Fatal(err)
- }
-}