aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cmd/skate-derive-key/main.go
blob: df44b70939cfcca519250fc042ce7b5b2a6d38cb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
// skate-derive-key derives a key from release entity JSON documents.
//
// $ skate-derive-key < release_entities.jsonlines > docs.tsv
//
// Result will be a three column TSV (ident, key, doc), LC_ALL=C sorted by key.
//
// ---- ident ---------------    ---- key ------------------------------     ---- doc ----------
//
// 4lzgf5wzljcptlebhyobccj7ru    2568diamagneticsusceptibilityofh8n2o10sr    {"abstracts":[],...
//
// After this step, a fast "itertools.groupby" or "skate-cluster" on key can yield clusters.
//
// Notes
//
// Using https://github.com/DataDog/zstd#stream-api, 3700 docs/s for key
// extraction only; using zstd -T0, we get 21K docs/s; about 13K docs/s; about
// 32h for 1.5B records.
//
// Default sort(1) buffer is 1K, but we'll need G's, e.g. -S35% of 48GB.
package main

import (
	"flag"
	"fmt"
	"log"
	"os"
	"runtime"
	"strings"
	"time"

	"git.archive.org/martin/cgraph/skate"
	"git.archive.org/martin/cgraph/skate/parallel"
	json "github.com/segmentio/encoding/json"
)

var (
	// Command line flags controlling key derivation and processing.
	keyFuncName   = flag.String("f", "tsand", "key function name, other: title, tnorm, tnysi, tsand")
	fixedField    = flag.String("F", "", "extract string value from a fixed top level field, e.g. source_release_ident, ...")
	numWorkers    = flag.Int("w", runtime.NumCPU(), "number of workers")
	batchSize     = flag.Int("b", 50000, "batch size")
	verbose       = flag.Bool("verbose", false, "show progress")
	bestEffort    = flag.Bool("B", false, "best effort")
	logFile       = flag.String("log", "", "log filename")
	skipEmptyKeys = flag.Bool("skip-empty-keys", false, "omit docs without keys")

	// wsReplacer strips tabs and newlines from the doc so it stays a
	// single TSV column.
	wsReplacer = strings.NewReplacer("\t", "", "\n", "")
	// keyOpts maps the -f flag value to the corresponding key function.
	keyOpts = map[string]skate.IdentifierKeyFunc{
		"title": skate.KeyTitle,
		"tnorm": skate.KeyTitleNormalized,
		"tnysi": skate.KeyTitleNysiis,
		"tsand": skate.KeyTitleSandcrawler,
	}
	// keyFunc is the selected key function; set in main from either
	// -F (fixed field extraction) or the keyOpts lookup.
	keyFunc skate.IdentifierKeyFunc
	// ok is reused for the keyOpts lookup in main; package scope only
	// because keyFunc is assigned (not declared) there.
	ok bool
)

// main wires flags to a key function, then streams JSON lines from
// stdin through a parallel processor, emitting "ident\tkey\tdoc" lines
// on stdout.
func main() {
	flag.Parse()
	if *fixedField != "" {
		// We want this, because from biblioref we wanted source_release_ident,
		// from refs release_ident, etc.
		keyFunc = func(p []byte) (id string, key string, err error) {
			var doc map[string]interface{}
			if err = json.Unmarshal(p, &doc); err != nil {
				return
			}
			v, ok := doc[*fixedField]
			if !ok {
				return "", "", nil
			}
			// Only scalar values make sense as keys; anything else
			// (objects, arrays, null) yields an empty key.
			switch t := v.(type) {
			case string:
				return "", t, nil
			case int, int64, float32, float64:
				return "", fmt.Sprintf("%v", t), nil
			default:
				return "", "", nil
			}
		}
	} else {
		if keyFunc, ok = keyOpts[*keyFuncName]; !ok {
			log.Fatal("invalid key func")
		}
	}
	if *logFile != "" {
		// Fix: O_WRONLY was missing; without a write mode OpenFile
		// defaults to read-only and every log write would fail.
		f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0644)
		if err != nil {
			log.Fatal(err)
		}
		defer f.Close()
		log.SetOutput(f)
	}
	if *verbose {
		started := time.Now()
		// Fix: arguments to a deferred call are evaluated at the defer
		// statement, so the previous `defer log.Printf(..., time.Since(started))`
		// always reported ~0s. Wrapping in a closure defers the
		// time.Since evaluation until main actually returns.
		defer func() {
			log.Printf("took: %s", time.Since(started))
		}()
	}
	pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) {
		ident, key, err := keyFunc(p)
		if err != nil {
			if *bestEffort {
				// Best effort: log and drop the record instead of
				// aborting the whole run.
				log.Printf("keyFunc failed with %v: %v", err, string(p))
				return nil, nil
			}
			return nil, err
		}
		ident, key = strings.TrimSpace(ident), strings.TrimSpace(key)
		if *skipEmptyKeys && key == "" {
			return nil, nil
		}
		// Strip tabs/newlines from the doc so the output stays one
		// record per line with exactly three TSV columns.
		v := fmt.Sprintf("%s\t%s\t%s\n", ident, key, wsReplacer.Replace(string(p)))
		return []byte(v), nil
	})
	pp.NumWorkers = *numWorkers
	pp.BatchSize = *batchSize
	pp.Verbose = *verbose
	if err := pp.Run(); err != nil {
		log.Fatal(err)
	}
}