skate/cmd/skate-resolve-journal-name/main.go


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148

// skate-resolve-journal-name works on TSV where one column (-f, 1-indexed)
// contains a release schema document (JSON). It inspects the container name,
// resolves journal name abbreviations and augments the release document with
// the resolved name under extra (e.g. in
// extra.skate.resolved_container_name).
//
// The abbreviation file format currently is a JSONL format, each document
// containing a "name" (required) and "abbrev" (optional), e.g. like:
// https://git.archive.org/martin/cgraph/-/raw/master/extra/abbrev/jabbrev.json.
package main

import (
	"bufio"
	"bytes"
	"flag"
	"fmt"
	"io"
	"log"
	"os"
	"runtime"
	"strings"

	"github.com/segmentio/encoding/json"
	"gitlab.com/internetarchive/refcat/skate"
	"gitlab.com/internetarchive/refcat/skate/parallel"
	"gitlab.com/internetarchive/refcat/skate/set"
)

// Command line flags. The pointers hold their defaults until flag.Parse is
// called in main; -f is 1-indexed and -A names the JSONL abbreviations file.
var (
	numWorkers       = flag.Int("w", runtime.NumCPU(), "number of workers")
	batchSize        = flag.Int("b", 100000, "batch size")
	column           = flag.Int("f", 2, "column to find the release schema document in (1-indexed)")
	sep              = flag.String("d", "\t", "delimiter to use")
	bestEffort       = flag.Bool("B", false, "only log errors, do not halt")
	abbrevFile       = flag.String("A", "", "path to abbreviations file")
	keepResolvedOnly = flag.Bool("R", false, "keep only lines, where we could resolve an abbreviation")
)

func main() {
	flag.Parse()
	ms, m, err := createAbbreviationMatchSet(*abbrevFile)
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("found %d abbreviation mappings", len(m))
	pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) {
		var (
			release *skate.Release
			err     error
		)
		fields := bytes.Split(p, []byte(*sep))
		if *column > len(fields) {
			return nil, fmt.Errorf("column mismatch")
		}
		if err = json.Unmarshal(fields[*column-1], &release); err != nil {
			if *bestEffort {
				log.Printf("failed to unmarshal: %v", string(p))
			} else {
				return nil, err
			}
		}
		// XXX: we do not need to do any lookup w/o container name anyway.
		if *keepResolvedOnly && release.ContainerName == "" {
			return nil, nil
		}
		name := strings.ToLower(release.ContainerName)
		results := ms.Lookup(name, -1)
		if len(results) != 1 {
			// To many or too few matches? We return the document unchanged
			if *keepResolvedOnly {
				return nil, nil
			} else {
				return p, nil
			}
		}
		resolved := m[results[0]]
		if len(resolved) != 1 {
			// Abbreviation mapping to different full names? Not sure how
			// common, or why, but skip.
			if *keepResolvedOnly {
				return nil, nil
			} else {
				return p, nil
			}
		}
		release.Extra.Skate.ResolvedContainerName = resolved[0]
		b, err := json.Marshal(release)
		if err != nil {
			return nil, err
		}
		if len(fields) == *column {
			// In case this is the last field, we want our newline back.
			b = append(b, []byte("\n")...)
		}
		fields[*column-1] = b
		return bytes.Join(fields, []byte(*sep)), nil
	})
	pp.NumWorkers = *numWorkers
	pp.BatchSize = *batchSize
	if err := pp.Run(); err != nil {
		log.Fatal(err)
	}
}

// createAbbreviationMatchSet creates a match set from a JSONL abbreviations
// file. It also returns a mapping from (lowercased) abbreviations to the
// distinct (lowercased) full journal names seen for that abbreviation.
//
// Each non-blank line must be a JSON document with a "name" and an optional
// "abbrev" field. Documents without an abbreviation are skipped, as there is
// nothing to map from. A final line without a trailing newline is processed
// like any other line. Errors from opening, reading or decoding the file are
// returned as is.
func createAbbreviationMatchSet(filename string) (*skate.MatchSet, map[string][]string, error) {
	f, err := os.Open(filename)
	if err != nil {
		return nil, nil, err
	}
	defer f.Close()
	var (
		sm = make(map[string]set.Set)
		br = bufio.NewReader(f)
	)
	for {
		b, err := br.ReadBytes('\n')
		if err != nil && err != io.EOF {
			return nil, nil, err
		}
		// ReadBytes hands back the data read so far together with io.EOF,
		// so a last line w/o newline still needs to be handled.
		done := err == io.EOF
		if len(bytes.TrimSpace(b)) > 0 {
			// Use a fresh value per line; fields missing from the JSON are
			// left untouched by Unmarshal and must not leak in from the
			// previous document.
			var nameAbbrev struct {
				Name   string `json:"name"`
				Abbrev string `json:"abbrev"`
			}
			if err := json.Unmarshal(b, &nameAbbrev); err != nil {
				return nil, nil, err
			}
			if key := strings.ToLower(nameAbbrev.Abbrev); key != "" {
				if _, ok := sm[key]; !ok {
					sm[key] = set.New()
				}
				sm[key].Add(strings.ToLower(nameAbbrev.Name))
			}
		}
		if done {
			break
		}
	}
	var (
		m       = make(map[string][]string, len(sm))
		abbrevs = make([]string, 0, len(sm))
	)
	for k, v := range sm {
		abbrevs = append(abbrevs, k)
		m[k] = v.Slice()
	}
	return skate.NewMatchSet(abbrevs), m, nil
}