// magrefs turns MAG references into a doi-to-doi version // PaperReferences.txt is a two column file: PaperId, PaperReferenceId. // // Use in memory lookup table. package main import ( "bufio" "flag" "fmt" "io" "log" "os" "strconv" "strings" _ "github.com/mattn/go-sqlite3" "github.com/miku/parallel" ) var ( mappingFile = flag.String("f", "", "two column TSV with key and value") cache = make(map[int]string) ) func populateCache(r io.Reader, cache map[int]string) error { var ( br = bufio.NewReader(r) i int ) for { line, err := br.ReadString('\n') if err == io.EOF { break } if err != nil { return err } fields := strings.Split(line, "\t") if len(fields) < 2 { continue } a, b := strings.TrimSpace(fields[0]), strings.TrimSpace(fields[1]) id, err := strconv.Atoi(a) if err != nil { log.Println("skipping invalid id: %s", line) continue } cache[id] = b i++ if i%1000000 == 0 { log.Printf("%d", i) } } return nil } func main() { flag.Parse() if *mappingFile == "" { log.Fatal("mapping file required") } f, err := os.Open(*mappingFile) if err != nil { log.Fatal(err) } defer f.Close() if err := populateCache(f, cache); err != nil { log.Fatal(err) } pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) { fields := strings.Split(string(p), "\t") if len(fields) < 2 { return nil, nil } s, t := strings.TrimSpace(fields[0]), strings.TrimSpace(fields[1]) source, err := strconv.Atoi(s) if err != nil { log.Printf("skipping invalid source id: %s (%v)", s, err) return nil, nil } target, err := strconv.Atoi(t) if err != nil { log.Printf("skipping invalid target id: %s (%v)", t, err) return nil, nil } v, ok := cache[source] if !ok { return nil, nil } w, ok := cache[target] if !ok { return nil, nil } line := fmt.Sprintf("%s\t%s\n", v, w) return []byte(line), nil }) if err := pp.Run(); err != nil { log.Fatal(err) } }