// magrefs turns MAG references into a doi-to-doi version // PaperReferences.txt is a two column file: PaperId, PaperReferenceId. // // Use in memory lookup table. package main import ( "bufio" "flag" "fmt" "io" "log" "os" "strconv" "strings" "sync/atomic" _ "github.com/mattn/go-sqlite3" "github.com/miku/parallel" ) var ( mappingFile = flag.String("f", "", "two column TSV with key and value") cache = make(map[int]string) edgeMissingDOI int64 // number of edges that have one or zero DOI, but not both ) func populateCache(r io.Reader, cache map[int]string) error { var ( br = bufio.NewReader(r) i int ) for { line, err := br.ReadString('\n') if err == io.EOF { break } if err != nil { return err } fields := strings.Split(line, "\t") if len(fields) < 2 { continue } a, b := strings.TrimSpace(fields[0]), strings.TrimSpace(fields[1]) id, err := strconv.Atoi(a) if err != nil { log.Println("skipping invalid id: %s", line) continue } cache[id] = b i++ if i%1000000 == 0 { log.Printf("%d", i) } } return nil } func main() { flag.Parse() if *mappingFile == "" { log.Fatal("mapping file required") } f, err := os.Open(*mappingFile) if err != nil { log.Fatal(err) } defer f.Close() if err := populateCache(f, cache); err != nil { log.Fatal(err) } pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) { var ( s, t, v, w string source, target int err error ok bool complete = true fields = strings.Split(string(p), "\t") ) if len(fields) < 2 { return nil, nil } s, t = strings.TrimSpace(fields[0]), strings.TrimSpace(fields[1]) if source, err = strconv.Atoi(s); err != nil { log.Printf("skipping invalid source id: %s (%v)", s, err) return nil, nil } if target, err = strconv.Atoi(t); err != nil { log.Printf("skipping invalid target id: %s (%v)", t, err) return nil, nil } if v, ok = cache[source]; !ok { complete = false } if w, ok = cache[target]; !ok { complete = false } if !complete { atomic.AddInt64(&edgeMissingDOI, 1) return nil, nil } line := fmt.Sprintf("%s\t%s\n", v, w) return []byte(line), nil }) if err := pp.Run(); err != nil { log.Fatal(err) } log.Printf("edges without DOI: %d", edgeMissingDOI) }