1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
|
// skate-resolve-journal-name works on TSV input where one column (-f,
// 1-indexed) contains a release schema document. It inspects the container
// name, tries to resolve it as a journal name abbreviation and, on success,
// augments the release document with the resolved name under extra (e.g. in
// extra.skate.resolved_container_name).
//
// The abbreviation file format currently is a JSONL format, each document
// containing a "name" and "abbrev" (optional).
package main
import (
"bufio"
"bytes"
"encoding/json"
"flag"
"fmt"
"io"
"log"
"os"
"runtime"
"strings"
"git.archive.org/martin/cgraph/skate"
"git.archive.org/martin/cgraph/skate/parallel"
"git.archive.org/martin/cgraph/skate/set"
)
// Command line flags. All of them are read in main after flag.Parse; the
// usage strings below are what the -h output shows.
var (
// numWorkers and batchSize are handed to the parallel processor as is.
numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
batchSize = flag.Int("b", 100000, "batch size")
// column is 1-indexed, so the default of 2 selects the second field.
column = flag.Int("f", 2, "column to find the release schema document in (1-indexed)")
sep = flag.String("d", "\t", "delimiter to use")
bestEffort = flag.Bool("B", false, "only log errors, do not halt")
// abbrevFile points to the JSONL file loaded by createAbbreviationMatchSet.
abbrevFile = flag.String("A", "", "path to abbreviate file")
keepResolvedOnly = flag.Bool("R", false, "keep only lines, where we could resolve an abbreviation")
)
func main() {
flag.Parse()
ms, m, err := createAbbreviationMatchSet(*abbrevFile)
if err != nil {
log.Fatal(err)
}
log.Printf("found %d abbreviation mappings", len(m))
pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) {
var (
release *skate.Release
err error
)
fields := bytes.Split(p, []byte(*sep))
if *column > len(fields) {
return nil, fmt.Errorf("column mismatch")
}
if err = json.Unmarshal(fields[*column-1], &release); err != nil {
if *bestEffort {
log.Printf("failed to unmarshal: %v", string(p))
} else {
return nil, err
}
}
// XXX: we do not need to do any lookup w/o container name anyway.
if *keepResolvedOnly && release.ContainerName == "" {
return nil, nil
}
name := strings.ToLower(release.ContainerName)
results := ms.Lookup(name, -1)
if len(results) != 1 {
// To many or too few matches? We return the document unchanged
if *keepResolvedOnly {
return nil, nil
} else {
return p, nil
}
}
resolved := m[results[0]]
if len(resolved) != 1 {
// Abbreviation mapping to different full names? Not sure how
// common, or why, but skip.
if *keepResolvedOnly {
return nil, nil
} else {
return p, nil
}
}
release.Extra.Skate.ResolvedContainerName = resolved[0]
b, err := json.Marshal(release)
if err != nil {
return nil, err
}
if len(fields) == *column {
// In case this is the last field, we want our newline back.
b = append(b, []byte("\n")...)
}
fields[*column-1] = b
return bytes.Join(fields, []byte(*sep)), nil
})
pp.NumWorkers = *numWorkers
pp.BatchSize = *batchSize
if err := pp.Run(); err != nil {
log.Fatal(err)
}
}
// NameAbbrev is a single document of the JSONL abbreviation file, pairing a
// journal name with its abbreviation.
type NameAbbrev struct {
// Name is the full journal name, read from the "name" key.
Name string `json:"name"`
// Abbrev is the abbreviation, read from the "abbrev" key; it may be absent
// from a document, in which case it decodes to the empty string.
Abbrev string `json:"abbrev"`
}
// createAbbreviationMatchSet creates a match set from a JSONL abbreviations
// file, one NameAbbrev document per line. It also returns a mapping from each
// lowercased abbreviation to the distinct lowercased full journal names seen
// for that abbreviation.
//
// Blank lines and documents without an abbreviation are skipped. A final line
// lacking a trailing newline is processed like any other line. Any read or
// decoding error aborts loading and is returned as is.
func createAbbreviationMatchSet(filename string) (*skate.MatchSet, map[string][]string, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, nil, err
	}
	defer f.Close()
	var (
		sm = make(map[string]set.Set) // abbreviation -> set of full names
		br = bufio.NewReader(f)
	)
	for {
		// ReadBytes may return data together with io.EOF, namely when the
		// file does not end with a newline; process the data before stopping.
		b, readErr := br.ReadBytes('\n')
		if readErr != nil && readErr != io.EOF {
			return nil, nil, readErr
		}
		if line := bytes.TrimSpace(b); len(line) > 0 {
			// Use a fresh value per line; json.Unmarshal leaves fields missing
			// from the document untouched, so a reused value would carry the
			// abbreviation of the previous line over.
			var na NameAbbrev
			if err := json.Unmarshal(line, &na); err != nil {
				return nil, nil, err
			}
			// "abbrev" is optional; without it, there is nothing to map.
			if key := strings.ToLower(na.Abbrev); key != "" {
				if _, ok := sm[key]; !ok {
					sm[key] = set.New()
				}
				sm[key].Add(strings.ToLower(na.Name))
			}
		}
		if readErr == io.EOF {
			break
		}
	}
	var (
		abbreviations = make([]string, 0, len(sm))
		m             = make(map[string][]string, len(sm))
	)
	for k, v := range sm {
		abbreviations = append(abbreviations, k)
		m[k] = v.Slice()
	}
	return skate.NewMatchSet(abbreviations), m, nil
}
|