aboutsummaryrefslogtreecommitdiffstats
path: root/extra/mag/magrefs-mem.go
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-09-28 13:39:19 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-09-28 13:39:19 +0200
commit4d109a3fd5940ef27ed3e68fdebf0ccd3f234c28 (patch)
treeb28505d2a5bee9b60477260edb2dc664510b4c93 /extra/mag/magrefs-mem.go
parent3af89868253a45f4e3fe912443276b31b7c72521 (diff)
downloadrefcat-4d109a3fd5940ef27ed3e68fdebf0ccd3f234c28.tar.gz
refcat-4d109a3fd5940ef27ed3e68fdebf0ccd3f234c28.zip
mag: notes on doi-to-doi refs
Diffstat (limited to 'extra/mag/magrefs-mem.go')
-rw-r--r--extra/mag/magrefs-mem.go101
1 files changed, 101 insertions, 0 deletions
diff --git a/extra/mag/magrefs-mem.go b/extra/mag/magrefs-mem.go
new file mode 100644
index 0000000..403e021
--- /dev/null
+++ b/extra/mag/magrefs-mem.go
@@ -0,0 +1,101 @@
+// magrefs turns MAG references into a doi-to-doi version.
+// PaperReferences.txt is a two-column file: PaperId, PaperReferenceId.
+//
+// Uses an in-memory lookup table.
+package main
+
+import (
+ "bufio"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "strconv"
+ "strings"
+
+ _ "github.com/mattn/go-sqlite3"
+ "github.com/miku/parallel"
+)
+
+var (
+ // mappingFile is the path given via -f; main refuses to run when it is
+ // empty and loads the file into cache before reading stdin.
+ mappingFile = flag.String("f", "", "two column TSV with key and value")
+ // cache maps the integer key from the first column of the mapping file to
+ // the value from its second column; it is filled by populateCache in main
+ // and only read afterwards.
+ cache = make(map[int]string)
+)
+
+// populateCache reads a tab-separated mapping from r and stores it in cache,
+// keyed by the integer in the first column, with the whitespace-trimmed second
+// column as the value.
+//
+// Lines with fewer than two columns are ignored; columns beyond the second are
+// ignored as well. Lines whose first column is not an integer are logged and
+// skipped. If a key occurs more than once, the last value wins. A final line
+// without a trailing newline is processed like any other line. The number of
+// stored entries is logged every million entries.
+//
+// Any read error other than io.EOF is returned unchanged.
+func populateCache(r io.Reader, cache map[int]string) error {
+	var (
+		br = bufio.NewReader(r)
+		i  int
+	)
+	for done := false; !done; {
+		line, err := br.ReadString('\n')
+		switch {
+		case err == io.EOF:
+			// ReadString hands back whatever it read together with io.EOF, so
+			// an unterminated last line still has to be handled below before
+			// the loop ends.
+			done = true
+		case err != nil:
+			return err
+		}
+		fields := strings.Split(line, "\t")
+		if len(fields) < 2 {
+			continue
+		}
+		a, b := strings.TrimSpace(fields[0]), strings.TrimSpace(fields[1])
+		id, err := strconv.Atoi(a)
+		if err != nil {
+			// Printf, not Println: the message contains a format verb.
+			log.Printf("skipping invalid id: %q", strings.TrimSpace(line))
+			continue
+		}
+		cache[id] = b
+		i++
+		if i%1000000 == 0 {
+			log.Printf("%d", i)
+		}
+	}
+	return nil
+}
+
+// main loads the mapping file named by -f into the package-level cache and
+// then rewrites two-column, tab-separated id pairs from stdin into the pairs
+// of their mapped values on stdout.
+func main() {
+	flag.Parse()
+	if *mappingFile == "" {
+		log.Fatal("mapping file required")
+	}
+	mapping, err := os.Open(*mappingFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer mapping.Close()
+	if err = populateCache(mapping, cache); err != nil {
+		log.Fatal(err)
+	}
+	// translate maps one input record to one output record. It returns
+	// (nil, nil) for records that are too short, have a non-integer id, or
+	// reference an id missing from cache; presumably the processor emits
+	// nothing for those - confirm against the parallel package.
+	translate := func(p []byte) ([]byte, error) {
+		cols := strings.Split(string(p), "\t")
+		if len(cols) < 2 {
+			return nil, nil
+		}
+		rawSource := strings.TrimSpace(cols[0])
+		rawTarget := strings.TrimSpace(cols[1])
+		sourceID, err := strconv.Atoi(rawSource)
+		if err != nil {
+			log.Printf("skipping invalid source id: %s (%v)", rawSource, err)
+			return nil, nil
+		}
+		targetID, err := strconv.Atoi(rawTarget)
+		if err != nil {
+			log.Printf("skipping invalid target id: %s (%v)", rawTarget, err)
+			return nil, nil
+		}
+		sourceValue, found := cache[sourceID]
+		if !found {
+			return nil, nil
+		}
+		targetValue, found := cache[targetID]
+		if !found {
+			return nil, nil
+		}
+		return []byte(fmt.Sprintf("%s\t%s\n", sourceValue, targetValue)), nil
+	}
+	if err := parallel.NewProcessor(os.Stdin, os.Stdout, translate).Run(); err != nil {
+		log.Fatal(err)
+	}
+}