// skate-derive-key derives a key from release entity JSON documents.
//
// $ skate-derive-key < release_entities.jsonlines > docs.tsv
//
// The result is a three-column TSV (ident, key, doc), to be LC_ALL=C sorted
// by key (see notes below).
//
// ---- ident --------------- ---- key ------------------------------ ---- doc ----------
//
// 4lzgf5wzljcptlebhyobccj7ru 2568diamagneticsusceptibilityofh8n2o10sr {"abstracts":[],...
//
// After this step, a fast "itertools.groupby" or "skate-cluster" on the key
// column can yield clusters.
//
// Notes
//
// Using https://github.com/DataDog/zstd#stream-api, about 3700 docs/s for key
// extraction only; decompressing with zstd -T0 instead, about 21K docs/s for
// extraction; end to end, about 13K docs/s, i.e. roughly 32h for 1.5B records.
//
// The default sort(1) buffer is 1K, but we will need gigabytes, e.g. -S35%
// on a machine with 48GB of RAM.
package main

import (
	"flag"
	"fmt"
	"log"
	"os"
	"runtime"
	"strings"

	"git.archive.org/martin/cgraph/skate"
	"git.archive.org/martin/cgraph/skate/parallel"

	json "github.com/segmentio/encoding/json"
)

var (
	keyFuncName   = flag.String("f", "tsand", "key function name, other: title, tnorm, tnysi, tsand")
	fixedField    = flag.String("F", "", "extract string value from a fixed top level field, e.g. source_release_ident, ...")
	numWorkers    = flag.Int("w", runtime.NumCPU(), "number of workers")
	batchSize     = flag.Int("b", 50000, "batch size")
	verbose       = flag.Bool("verbose", false, "show progress")
	bestEffort    = flag.Bool("B", false, "best effort")
	logFile       = flag.String("log", "", "log filename")
	skipEmptyKeys = flag.Bool("skip-empty-keys", false, "omit docs without keys")

	// wsReplacer strips tabs and newlines from the doc, so each record stays
	// on a single TSV line.
	wsReplacer = strings.NewReplacer("\t", "", "\n", "")
	keyOpts    = map[string]skate.IdentifierKeyFunc{
		"title": skate.KeyTitle,
		"tnorm": skate.KeyTitleNormalized,
		"tnysi": skate.KeyTitleNysiis,
		"tsand": skate.KeyTitleSandcrawler,
	}
	keyFunc skate.IdentifierKeyFunc
	ok      bool
)

func main() {
	flag.Parse()
	if *fixedField != "" {
		// Extract a fixed top-level field instead of computing a key: from
		// biblioref docs we want source_release_ident, from refs
		// release_ident, etc.
		keyFunc = func(p []byte) (id string, key string, err error) {
			var doc map[string]interface{}
			if err = json.Unmarshal(p, &doc); err != nil {
				return
			}
			v, ok := doc[*fixedField]
			if !ok {
				return "", "", nil
			}
			switch t := v.(type) {
			case string:
				return "", t, nil
			case int, int64, float32, float64:
				return "", fmt.Sprintf("%v", t), nil
			default:
				return "", "", nil
			}
		}
	} else {
		if keyFunc, ok = keyOpts[*keyFuncName]; !ok {
			log.Fatal("invalid key func")
		}
	}
	if *logFile != "" {
		// O_WRONLY is required alongside O_APPEND; without it the file is
		// opened read-only and log writes would fail.
		f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
		if err != nil {
			log.Fatal(err)
		}
		defer f.Close()
		log.SetOutput(f)
	}
	pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) {
		ident, key, err := keyFunc(p)
		if err != nil {
			if *bestEffort {
				log.Printf("keyFunc failed with %v: %v", err, string(p))
				return nil, nil
			}
			return nil, err
		}
		ident, key = strings.TrimSpace(ident), strings.TrimSpace(key)
		if *skipEmptyKeys && key == "" {
			return nil, nil
		}
		v := fmt.Sprintf("%s\t%s\t%s\n", ident, key, wsReplacer.Replace(string(p)))
		return []byte(v), nil
	})
	pp.NumWorkers = *numWorkers
	pp.BatchSize = *batchSize
	pp.Verbose = *verbose
	if err := pp.Run(); err != nil {
		log.Fatal(err)
	}
}
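
// Downstream sketch: the header notes that clustering expects the TSV to be
// LC_ALL=C sorted by key (column 2). Assuming GNU sort and the skate-cluster
// tool mentioned above (whose exact flags are not documented here), a
// hypothetical pipeline could look like:
//
//   $ skate-derive-key -f tsand -skip-empty-keys < release_entities.jsonlines |
//       LC_ALL=C sort -k2,2 -S35% > docs.tsv
//   $ skate-cluster < docs.tsv > clusters.jsonlines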