// skate-resolve-journal-name works on TSV where one column (-f, 1-indexed) is // a release schema (json). It will inspect the container name and will resolve // journal name abbreviations and will augment the release document with data // in extra containing the resolved name (e.g. in // extra.skate.resolved_container_name). // // The abbreviation file format currently is a JSONL format, each document // containing a "name" (required) and "abbrev" (optional), e.g. like: // https://git.archive.org/martin/cgraph/-/raw/master/extra/abbrev/jabbrev.json. package main import ( "bufio" "bytes" "flag" "fmt" "io" "log" "os" "runtime" "strings" "github.com/segmentio/encoding/json" "gitlab.com/internetarchive/refcat/skate" "gitlab.com/internetarchive/refcat/skate/parallel" "gitlab.com/internetarchive/refcat/skate/set" ) var ( numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") batchSize = flag.Int("b", 100000, "batch size") column = flag.Int("f", 2, "column to find the release schema document in (1-indexed)") sep = flag.String("d", "\t", "delimiter to use") bestEffort = flag.Bool("B", false, "only log errors, do not halt") abbrevFile = flag.String("A", "", "path to abbreviations file") keepResolvedOnly = flag.Bool("R", false, "keep only lines, where we could resolve an abbreviation") ) func main() { flag.Parse() ms, m, err := createAbbreviationMatchSet(*abbrevFile) if err != nil { log.Fatal(err) } log.Printf("found %d abbreviation mappings", len(m)) pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) { var ( release *skate.Release err error ) fields := bytes.Split(p, []byte(*sep)) if *column > len(fields) { return nil, fmt.Errorf("column mismatch") } if err = json.Unmarshal(fields[*column-1], &release); err != nil { if *bestEffort { log.Printf("failed to unmarshal: %v", string(p)) } else { return nil, err } } // XXX: we do not need to do any lookup w/o container name anyway. if *keepResolvedOnly && release.ContainerName == "" { return nil, nil } name := strings.ToLower(release.ContainerName) results := ms.Lookup(name, -1) if len(results) != 1 { // To many or too few matches? We return the document unchanged if *keepResolvedOnly { return nil, nil } else { return p, nil } } resolved := m[results[0]] if len(resolved) != 1 { // Abbreviation mapping to different full names? Not sure how // common, or why, but skip. if *keepResolvedOnly { return nil, nil } else { return p, nil } } release.Extra.Skate.ResolvedContainerName = resolved[0] b, err := json.Marshal(release) if err != nil { return nil, err } if len(fields) == *column { // In case this is the last field, we want our newline back. b = append(b, []byte("\n")...) } fields[*column-1] = b return bytes.Join(fields, []byte(*sep)), nil }) pp.NumWorkers = *numWorkers pp.BatchSize = *batchSize if err := pp.Run(); err != nil { log.Fatal(err) } } // createAbbreviationMatchSet creates a match set from JSONL abbreviations // file. It also returns a mapping from abbreviations to full journal names for // that abbreviation. func createAbbreviationMatchSet(filename string) (*skate.MatchSet, map[string][]string, error) { f, err := os.Open(filename) if err != nil { return nil, nil, err } defer f.Close() var ( sm = make(map[string]set.Set) m = make(map[string][]string) br = bufio.NewReader(f) nameAbbrev struct { Name string `json:"name"` Abbrev string `json:"abbrev"` } abbrevs []string key string ) for { b, err := br.ReadBytes('\n') if err == io.EOF { break } if err != nil { return nil, nil, err } if err := json.Unmarshal(b, &nameAbbrev); err != nil { return nil, nil, err } key = strings.ToLower(nameAbbrev.Abbrev) if _, ok := sm[key]; !ok { sm[key] = set.New() } sm[key].Add(strings.ToLower(nameAbbrev.Name)) } for k, v := range sm { abbrevs = append(abbrevs, k) m[k] = v.Slice() } matchSet := skate.NewMatchSet(abbrevs) return matchSet, m, nil }