aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cmd/skate-resolve-journal-name/main.go
blob: d98f166da43b2116f2e3925377fe67396a9bb718 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// skate-resolve-journal-name works on TSV where one column (-f) contains a
// release schema document. It inspects the container name, tries to resolve
// it as a journal name abbreviation and, on success, augments the release
// document with the resolved name (stored in the extra field).
//
// The abbreviation file is currently in JSONL format, each document
// containing a "name" and an (optional) "abbrev" field.
package main

import (
	"bufio"
	"bytes"
	"encoding/json"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"runtime"
	"strings"

	"git.archive.org/martin/cgraph/skate"
	"git.archive.org/martin/cgraph/skate/parallel"
	"git.archive.org/martin/cgraph/skate/set"
)

// Command line flags, grouped by what they control.
var (
	// Layout of the input lines.
	column = flag.Int("f", 2, "column to find the release schema document in (1-indexed)")
	sep    = flag.String("d", "\t", "delimiter to use")

	// Abbreviation data and what to do with unresolved lines.
	abbrevFile       = flag.String("A", "", "path to abbreviate file")
	keepResolvedOnly = flag.Bool("R", false, "keep only lines, where we could resolve an abbreviation")

	// Processing: parallelism, batching and error policy.
	numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
	batchSize  = flag.Int("b", 100000, "batch size")
	bestEffort = flag.Bool("B", false, "only log errors, do not halt")
)

func main() {
	flag.Parse()
	ms, m, err := createAbbreviationMatchSet(*abbrevFile)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("found %d abbreviation mappings", len(m))
	pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) {
		var (
			release *skate.Release
			err     error
		)
		fields := bytes.Split(p, []byte(*sep))
		if *column > len(fields) {
			return nil, fmt.Errorf("column mismatch")
		}
		if err = json.Unmarshal(fields[*column-1], &release); err != nil {
			if *bestEffort {
				log.Printf("failed to unmarshal: %v", string(p))
			} else {
				return nil, err
			}
		}
		name := strings.ToLower(release.ContainerName)
		results := ms.Lookup(name, -1)
		if len(results) != 1 {
			// To many or too few matches? We return the document unchanged
			if *keepResolvedOnly {
				return nil, nil
			} else {
				return p, nil
			}
		}
		resolved := m[results[0]]
		if len(resolved) != 1 {
			// Abbreviation mapping to different full names? Not sure how
			// common, or why, but skip.
			if *keepResolvedOnly {
				return nil, nil
			} else {
				return p, nil
			}
		}
		release.Extra.Skate.ResolvedContainerName = resolved[0]
		b, err := json.Marshal(release)
		if err != nil {
			return nil, err
		}
		if len(fields) == *column {
			// In case this is the last field, we want our newline back.
			b = append(b, []byte("\n")...)
		}
		fields[*column-1] = b
		return bytes.Join(fields, []byte(*sep)), nil
	})
	pp.NumWorkers = *numWorkers
	pp.BatchSize = *batchSize
	if err := pp.Run(); err != nil {
		log.Fatal(err)
	}
}

// NameAbbrev is a single document from the JSONL abbreviation file, pairing
// a name with its abbreviation.
type NameAbbrev struct {
	// Name is decoded from the "name" key.
	Name string `json:"name"`
	// Abbrev is decoded from the "abbrev" key; the key may be absent, in
	// which case the field stays empty.
	Abbrev string `json:"abbrev"`
}

// createAbbreviationMatchSet creates a match set from a JSONL abbreviations
// file. It also returns a mapping from each (lowercased) abbreviation to the
// distinct (lowercased) full journal names seen for that abbreviation.
//
// Blank lines are ignored and documents without an abbreviation are skipped,
// since there is nothing to resolve for them. A final line without a trailing
// newline is processed like any other line. An error is returned if the file
// cannot be opened or read, or if a line is not valid JSON.
func createAbbreviationMatchSet(filename string) (*skate.MatchSet, map[string][]string, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, nil, err
	}
	defer f.Close()
	var (
		sm = make(map[string]set.Set)
		br = bufio.NewReader(f)
	)
	for {
		b, err := br.ReadBytes('\n')
		if err != nil && err != io.EOF {
			return nil, nil, err
		}
		// ReadBytes may return data together with io.EOF (last line without
		// newline), so we handle the data first and stop afterwards.
		atEOF := err == io.EOF
		if line := bytes.TrimSpace(b); len(line) > 0 {
			// Use a fresh value per line; decoding into a reused struct would
			// carry over the previous abbreviation, if "abbrev" is absent.
			var na NameAbbrev
			if err := json.Unmarshal(line, &na); err != nil {
				return nil, nil, fmt.Errorf("parsing abbreviation file %q: %w", filename, err)
			}
			if key := strings.ToLower(na.Abbrev); key != "" {
				names, ok := sm[key]
				if !ok {
					names = set.New()
					sm[key] = names
				}
				names.Add(strings.ToLower(na.Name))
			}
		}
		if atEOF {
			break
		}
	}
	var (
		m             = make(map[string][]string, len(sm))
		abbreviations = make([]string, 0, len(sm))
	)
	for k, v := range sm {
		abbreviations = append(abbreviations, k)
		m[k] = v.Slice()
	}
	return skate.NewMatchSet(abbreviations), m, nil
}