diff options
| author | Martin Czygan <martin.czygan@gmail.com> | 2021-09-28 13:39:19 +0200 |
|---|---|---|
| committer | Martin Czygan <martin.czygan@gmail.com> | 2021-09-28 13:39:19 +0200 |
| commit | 4d109a3fd5940ef27ed3e68fdebf0ccd3f234c28 (patch) | |
| tree | b28505d2a5bee9b60477260edb2dc664510b4c93 /extra/mag/magrefs-mem.go | |
| parent | 3af89868253a45f4e3fe912443276b31b7c72521 (diff) | |
| download | refcat-4d109a3fd5940ef27ed3e68fdebf0ccd3f234c28.tar.gz refcat-4d109a3fd5940ef27ed3e68fdebf0ccd3f234c28.zip | |
mag: notes on doi-to-doi refs
Diffstat (limited to 'extra/mag/magrefs-mem.go')
| -rw-r--r-- | extra/mag/magrefs-mem.go | 101 |
1 files changed, 101 insertions, 0 deletions
diff --git a/extra/mag/magrefs-mem.go b/extra/mag/magrefs-mem.go new file mode 100644 index 0000000..403e021 --- /dev/null +++ b/extra/mag/magrefs-mem.go @@ -0,0 +1,101 @@ +// magrefs turns MAG references into a doi-to-doi version +// PaperReferences.txt is a two column file: PaperId, PaperReferenceId. +// +// Use in memory lookup table. +package main + +import ( + "bufio" + "flag" + "fmt" + "io" + "log" + "os" + "strconv" + "strings" + + _ "github.com/mattn/go-sqlite3" + "github.com/miku/parallel" +) + +var ( + mappingFile = flag.String("f", "", "two column TSV with key and value") + cache = make(map[int]string) +) + +func populateCache(r io.Reader, cache map[int]string) error { + var ( + br = bufio.NewReader(r) + i int + ) + for { + line, err := br.ReadString('\n') + if err == io.EOF { + break + } + if err != nil { + return err + } + fields := strings.Split(line, "\t") + if len(fields) < 2 { + continue + } + a, b := strings.TrimSpace(fields[0]), strings.TrimSpace(fields[1]) + id, err := strconv.Atoi(a) + if err != nil { + log.Println("skipping invalid id: %s", line) + continue + } + cache[id] = b + i++ + if i%1000000 == 0 { + log.Printf("%d", i) + } + } + return nil +} + +func main() { + flag.Parse() + if *mappingFile == "" { + log.Fatal("mapping file required") + } + f, err := os.Open(*mappingFile) + if err != nil { + log.Fatal(err) + } + defer f.Close() + if err := populateCache(f, cache); err != nil { + log.Fatal(err) + } + pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) { + fields := strings.Split(string(p), "\t") + if len(fields) < 2 { + return nil, nil + } + s, t := strings.TrimSpace(fields[0]), strings.TrimSpace(fields[1]) + source, err := strconv.Atoi(s) + if err != nil { + log.Printf("skipping invalid source id: %s (%v)", s, err) + return nil, nil + } + target, err := strconv.Atoi(t) + if err != nil { + log.Printf("skipping invalid target id: %s (%v)", t, err) + return nil, nil + } + v, ok := cache[source] + if !ok { + return nil, nil + } + w, ok := cache[target] + if !ok { + return nil, nil + } + line := fmt.Sprintf("%s\t%s\n", v, w) + return []byte(line), nil + }) + if err := pp.Run(); err != nil { + log.Fatal(err) + } +} |
