aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-09-28 13:39:19 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-09-28 13:39:19 +0200
commit4d109a3fd5940ef27ed3e68fdebf0ccd3f234c28 (patch)
treeb28505d2a5bee9b60477260edb2dc664510b4c93
parent3af89868253a45f4e3fe912443276b31b7c72521 (diff)
downloadrefcat-4d109a3fd5940ef27ed3e68fdebf0ccd3f234c28.tar.gz
refcat-4d109a3fd5940ef27ed3e68fdebf0ccd3f234c28.zip
mag: notes on doi-to-doi refs
-rw-r--r--extra/mag/.gitignore2
-rw-r--r--extra/mag/Makefile8
-rw-r--r--extra/mag/README.md57
-rw-r--r--extra/mag/magrefs-mem.go101
-rw-r--r--extra/mag/memkey.go45
5 files changed, 208 insertions, 5 deletions
diff --git a/extra/mag/.gitignore b/extra/mag/.gitignore
index 607a495..2ac8279 100644
--- a/extra/mag/.gitignore
+++ b/extra/mag/.gitignore
@@ -1 +1,3 @@
magrefs
+magrefs-mem
+memkey
diff --git a/extra/mag/Makefile b/extra/mag/Makefile
index 6a19455..f3698dd 100644
--- a/extra/mag/Makefile
+++ b/extra/mag/Makefile
@@ -1,8 +1,12 @@
SHELL := /bin/bash
+TARGETS = magrefs magrefs-mem memkey
-magrefs: magrefs.go
+.PHONY: all
+all: $(TARGETS)
+
+%: %.go
go build -ldflags "-w -s" -o $@ $^
.PHONY: clean
clean:
- rm -f magrefs
+ rm -f $(TARGETS)
diff --git a/extra/mag/README.md b/extra/mag/README.md
index 8f8a7fb..9e20c44 100644
--- a/extra/mag/README.md
+++ b/extra/mag/README.md
@@ -7,9 +7,60 @@ Using: https://archive.org/details/mag-2021-06-07
In order to generate a doi-to-doi version, we need to:
-* create a mapping from id-to-doi
+* create a mapping from id-to-doi (db is slow, in-memory needs 16G+ RAM)
* apply the mapping to the PaperReferences file
-```sh
-$ time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | mkocidb -o /sandcrawler-db/tmp-refcat/mag_id_doi.db
```
+# https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema
+# MAG has a Papers.txt.gz file containing the ID and DOI
+#
+# (1) create a mapping from id to doi (e.g. in sqlite3)
+# (2) turn id-to-id references into doi-to-doi with lookup table
+#
+# $ unpigz -c /magna/data/mag-2020-06-25/Papers.txt.gz | cut -f1,3 | pv -l
+# 238M 0:11:16 [ 353k/s]
+#
+# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | pv -l > /dev/null
+# 260M 0:10:32 [ 412k/s]
+#
+# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | pv -l > /dev/null
+# 96.7M 0:11:05 [ 145k/s]
+#
+# $ time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | mkocidb -o mag_id_doi.db
+#
+# 2021/09/27 17:08:45 [ok] initialized database -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
+# written 2.9G -- 3.4M/s
+# 2021/09/27 17:23:11 import done
+# 2021/09/27 17:23:11 creating index
+# 2021/09/27 17:26:44 [ok] 1/2 created index -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
+# 2021/09/27 17:31:53 [ok] 2/2 created index -- /sandcrawler-db/tmp-refcat/mag_id_doi.db
+#
+# real 23m7.744s
+# user 30m55.642s
+# sys 4m17.959s
+#
+# Can use an in-memory map, too - sqlite3 lookups for 2B+ items take a while at
+# 30 Kqps; the map takes 30% of RAM on 48GB, sigh; just map[int]string from paper id to doi.
+#
+# Prepare the two-column TSV (paper id, doi) mapping file.
+#
+# $ time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | pv -l > /magna/data/mag-2021-06-07/mapping.tsv
+#
+# real 15m27.348s
+# user 21m4.297s
+# sys 3m34.441s
+#
+# $ time zstdcat -T0 PaperReferences.txt.zst |
+# magrefs-mem -f /magna/data/mag-2021-06-07/mapping.tsv |
+# pv -l | zstd -c -T0 > doi_refs.tsv.zst
+#
+# real 41m12.911s
+# user 223m14.467s
+# sys 30m43.260s
+#
+# $ zstdcat -T0 doi_refs.tsv.zst| pv -l | wc -l
+# 1.32G 0:06:33 [3.34M/s] [ <=> ]1315040677
+# 1315040677
+```
+
+Found 1,315,040,677 DOI-to-DOI mappings.
diff --git a/extra/mag/magrefs-mem.go b/extra/mag/magrefs-mem.go
new file mode 100644
index 0000000..403e021
--- /dev/null
+++ b/extra/mag/magrefs-mem.go
@@ -0,0 +1,101 @@
+// magrefs-mem turns MAG references into a doi-to-doi version.
+// PaperReferences.txt is a two column file: PaperId, PaperReferenceId.
+//
+// Uses an in-memory lookup table.
+package main
+
+import (
+ "bufio"
+ "flag"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "strconv"
+ "strings"
+
+ _ "github.com/mattn/go-sqlite3"
+ "github.com/miku/parallel"
+)
+
+var (
+ mappingFile = flag.String("f", "", "two column TSV with key and value") // path to the mapping file; main exits if it is empty
+ cache = make(map[int]string) // lookup table filled by populateCache in main: integer id -> second TSV column
+)
+
+// populateCache reads a tab separated file from r and stores the first two
+// columns of every line in cache, using the first column (parsed as an
+// integer) as key and the second column as value. Surrounding whitespace is
+// trimmed from both columns.
+//
+// Lines with fewer than two columns are skipped silently; lines whose first
+// column is not an integer are logged and skipped. Progress is logged every
+// one million stored entries. A read error other than io.EOF is returned.
+func populateCache(r io.Reader, cache map[int]string) error {
+	var (
+		br = bufio.NewReader(r)
+		i  int
+	)
+	for {
+		line, err := br.ReadString('\n')
+		if err != nil && err != io.EOF {
+			return err
+		}
+		// ReadString hands back the data read so far together with io.EOF
+		// when the input does not end in a newline, so the line has to be
+		// processed before the EOF check or the last record would be lost.
+		if fields := strings.Split(line, "\t"); len(fields) >= 2 {
+			a, b := strings.TrimSpace(fields[0]), strings.TrimSpace(fields[1])
+			if id, perr := strconv.Atoi(a); perr != nil {
+				log.Printf("skipping invalid id: %s", strings.TrimSpace(line))
+			} else {
+				cache[id] = b
+				i++
+				if i%1000000 == 0 {
+					log.Printf("%d", i)
+				}
+			}
+		}
+		if err == io.EOF {
+			return nil
+		}
+	}
+}
+
+// main loads the mapping file given via -f into the in-memory cache and then
+// rewrites the two column id records read from stdin into records of the
+// mapped values (DOIs, per the package comment), written to stdout.
+func main() {
+	flag.Parse()
+	if *mappingFile == "" {
+		log.Fatal("mapping file required")
+	}
+	f, err := os.Open(*mappingFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer f.Close()
+	if err := populateCache(f, cache); err != nil {
+		log.Fatal(err)
+	}
+	// translate maps one "source<TAB>target" id record to the corresponding
+	// pair of cached values. Records that are malformed, or for which either
+	// id has no cache entry, produce no output.
+	// NOTE(review): presumably the processor calls this once per input line
+	// and from several goroutines; the cache is only read here - confirm.
+	translate := func(p []byte) ([]byte, error) {
+		cols := strings.Split(string(p), "\t")
+		if len(cols) < 2 {
+			return nil, nil
+		}
+		rawSource := strings.TrimSpace(cols[0])
+		rawTarget := strings.TrimSpace(cols[1])
+		sourceID, err := strconv.Atoi(rawSource)
+		if err != nil {
+			log.Printf("skipping invalid source id: %s (%v)", rawSource, err)
+			return nil, nil
+		}
+		targetID, err := strconv.Atoi(rawTarget)
+		if err != nil {
+			log.Printf("skipping invalid target id: %s (%v)", rawTarget, err)
+			return nil, nil
+		}
+		sourceValue, found := cache[sourceID]
+		if !found {
+			return nil, nil
+		}
+		targetValue, found := cache[targetID]
+		if !found {
+			return nil, nil
+		}
+		return []byte(fmt.Sprintf("%s\t%s\n", sourceValue, targetValue)), nil
+	}
+	if err := parallel.NewProcessor(os.Stdin, os.Stdout, translate).Run(); err != nil {
+		log.Fatal(err)
+	}
+}
diff --git a/extra/mag/memkey.go b/extra/mag/memkey.go
new file mode 100644
index 0000000..b2e4b42
--- /dev/null
+++ b/extra/mag/memkey.go
@@ -0,0 +1,45 @@
+package main
+
+import (
+ "bufio"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "strconv"
+ "strings"
+)
+
+// main reads a two column TSV (integer id, value) from stdin into an
+// in-memory map, logging progress every one million entries, and then waits
+// for a line on stdin before exiting, so the process (and its map) stays
+// alive until then.
+//
+// Lines with fewer than two columns are skipped silently; lines whose first
+// column is not an integer are logged and skipped.
+func main() {
+	var (
+		cache = make(map[int]string)
+		br    = bufio.NewReader(os.Stdin)
+		i     int
+	)
+	for {
+		line, err := br.ReadString('\n')
+		if err != nil && err != io.EOF {
+			log.Fatal(err)
+		}
+		// ReadString returns the data read so far together with io.EOF when
+		// the input does not end in a newline; handle the line before the
+		// EOF check so the last record is not dropped.
+		if fields := strings.Split(line, "\t"); len(fields) >= 2 {
+			a, b := strings.TrimSpace(fields[0]), strings.TrimSpace(fields[1])
+			if id, perr := strconv.Atoi(a); perr != nil {
+				log.Printf("skipping invalid id: %s", strings.TrimSpace(line))
+			} else {
+				cache[id] = b
+				i++
+				if i%1000000 == 0 {
+					log.Printf("%d", i)
+				}
+			}
+		}
+		if err == io.EOF {
+			break
+		}
+	}
+	log.Println("press enter to quit")
+	// NOTE(review): when stdin is a pipe or file it is already at EOF here,
+	// so Scanln returns immediately instead of waiting - confirm whether the
+	// prompt should read from the terminal instead.
+	fmt.Scanln()
+}