From 4d109a3fd5940ef27ed3e68fdebf0ccd3f234c28 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Tue, 28 Sep 2021 13:39:19 +0200 Subject: mag: notes on doi-to-doi refs --- extra/mag/.gitignore | 2 + extra/mag/Makefile | 8 +++- extra/mag/README.md | 57 ++++++++++++++++++++++++-- extra/mag/magrefs-mem.go | 101 +++++++++++++++++++++++++++++++++++++++++++++++ extra/mag/memkey.go | 45 +++++++++++++++++++++ 5 files changed, 208 insertions(+), 5 deletions(-) create mode 100644 extra/mag/magrefs-mem.go create mode 100644 extra/mag/memkey.go diff --git a/extra/mag/.gitignore b/extra/mag/.gitignore index 607a495..2ac8279 100644 --- a/extra/mag/.gitignore +++ b/extra/mag/.gitignore @@ -1 +1,3 @@ magrefs +magrefs-mem +memkey diff --git a/extra/mag/Makefile b/extra/mag/Makefile index 6a19455..f3698dd 100644 --- a/extra/mag/Makefile +++ b/extra/mag/Makefile @@ -1,8 +1,12 @@ SHELL := /bin/bash +TARGETS = magrefs magrefs-mem memkey -magrefs: magrefs.go +.PHONY: all +all: $(TARGETS) + +%: %.go go build -ldflags "-w -s" -o $@ $^ .PHONY: clean clean: - rm -f magrefs + rm -f $(TARGETS) diff --git a/extra/mag/README.md b/extra/mag/README.md index 8f8a7fb..9e20c44 100644 --- a/extra/mag/README.md +++ b/extra/mag/README.md @@ -7,9 +7,60 @@ Using: https://archive.org/details/mag-2021-06-07 In order to generate a doi-to-doi version, we need to: -* create a mapping from id-to-doi +* create a mapping from id-to-doi (db is slow, in-memory needs 16G+ RAM) * apply the mapping to the PaperReferences file -```sh -$ time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | mkocidb -o /sandcrawler-db/tmp-refcat/mag_id_doi.db ``` +# https://docs.microsoft.com/en-us/academic-services/graph/reference-data-schema +# MAG has a Papers.txt.gz file containing the ID and DOI +# +# (1) create a mapping from id to doi (e.g. 
in sqlite3) +# (2) turn id-to-id references into doi-to-doi with lookup table +# +# $ unpigz -c /magna/data/mag-2020-06-25/Papers.txt.gz | cut -f1,3 | pv -l +# 238M 0:11:16 [ 353k/s] +# +# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | pv -l > /dev/null +# 260M 0:10:32 [ 412k/s] +# +# $ unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | pv -l > /dev/null +# 96.7M 0:11:05 [ 145k/s] +# +# $ time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | mkocidb -o mag_id_doi.db +# +# 2021/09/27 17:08:45 [ok] initialized database -- /sandcrawler-db/tmp-refcat/mag_id_doi.db +# written 2.9G -- 3.4M/s +# 2021/09/27 17:23:11 import done +# 2021/09/27 17:23:11 creating index +# 2021/09/27 17:26:44 [ok] 1/2 created index -- /sandcrawler-db/tmp-refcat/mag_id_doi.db +# 2021/09/27 17:31:53 [ok] 2/2 created index -- /sandcrawler-db/tmp-refcat/mag_id_doi.db +# +# real 23m7.744s +# user 30m55.642s +# sys 4m17.959s +# +# Can use an in-memory map, too - sqlite3 lookups for 2B+ items take a while at +# 30 Kqps; the map takes 30% of RAM on 48GB, sigh; just map[int]string from paper id to doi. +# +# Prepare the two-column TSV list. +# +# $ time unpigz -c /magna/data/mag-2021-06-07/Papers.txt.gz | cut -f1,3 | awk '$2 != ""' | pv -l > /magna/data/mag-2021-06-07/mapping.tsv +# +# real 15m27.348s +# user 21m4.297s +# sys 3m34.441s +# +# $ time zstdcat -T0 PaperReferences.txt.zst | +# magrefs-mem -f /magna/data/mag-2021-06-07/mapping.tsv | +# pv -l | zstd -c -T0 > doi_refs.tsv.zst +# +# real 41m12.911s +# user 223m14.467s +# sys 30m43.260s +# +# $ zstdcat -T0 doi_refs.tsv.zst| pv -l | wc -l +# 1.32G 0:06:33 [3.34M/s] [ <=> ]1315040677 +# 1315040677 +``` + +Found 1,315,040,677 DOI-to-DOI mappings. 
diff --git a/extra/mag/magrefs-mem.go b/extra/mag/magrefs-mem.go
new file mode 100644
index 0000000..403e021
--- /dev/null
+++ b/extra/mag/magrefs-mem.go
@@ -0,0 +1,125 @@
+// magrefs-mem turns MAG references into a doi-to-doi version.
+// PaperReferences.txt is a two column file: PaperId, PaperReferenceId.
+//
+// Use in memory lookup table: the id-to-doi mapping is loaded from a two
+// column TSV file (-f); id pairs are read from stdin and the corresponding
+// doi pairs are written to stdout.
+package main
+
+import (
+	"bufio"
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"strconv"
+	"strings"
+
+	// NOTE(review): nothing in this file uses database/sql, so this driver
+	// import looks unused here (and it requires cgo) -- confirm and drop.
+	_ "github.com/mattn/go-sqlite3"
+	"github.com/miku/parallel"
+)
+
+var (
+	mappingFile = flag.String("f", "", "two column TSV with key and value")
+	cache       = make(map[int]string)
+)
+
+// insertLine parses a single "id<TAB>value" line and stores the value in
+// cache under the integer id. Lines with fewer than two columns are ignored,
+// lines with a non-integer id are logged and skipped. It reports whether an
+// entry was stored.
+func insertLine(cache map[int]string, line string) bool {
+	fields := strings.Split(line, "\t")
+	if len(fields) < 2 {
+		return false
+	}
+	a, b := strings.TrimSpace(fields[0]), strings.TrimSpace(fields[1])
+	id, err := strconv.Atoi(a)
+	if err != nil {
+		// Printf, not Println: the message contains a formatting verb.
+		log.Printf("skipping invalid id: %s", strings.TrimSpace(line))
+		return false
+	}
+	cache[id] = b
+	return true
+}
+
+// populateCache fills cache from the two column TSV read from r, logging
+// progress every million entries. It returns the first read error other than
+// io.EOF.
+func populateCache(r io.Reader, cache map[int]string) error {
+	var (
+		br = bufio.NewReader(r)
+		i  int
+	)
+	for {
+		line, err := br.ReadString('\n')
+		if err != nil && err != io.EOF {
+			return err
+		}
+		// ReadString returns the data read so far together with io.EOF when
+		// the input does not end in a newline, so the line is handled before
+		// the EOF check; otherwise the last record would be dropped.
+		if insertLine(cache, line) {
+			i++
+			if i%1000000 == 0 {
+				log.Printf("%d", i)
+			}
+		}
+		if err == io.EOF {
+			return nil
+		}
+	}
+}
+
+// main loads the mapping file into the cache and then maps each id pair read
+// from stdin to a doi pair on stdout; the callback returns no output for pairs
+// with an unparsable id or an id that is missing from the cache.
+func main() {
+	flag.Parse()
+	if *mappingFile == "" {
+		log.Fatal("mapping file required")
+	}
+	f, err := os.Open(*mappingFile)
+	if err != nil {
+		log.Fatal(err)
+	}
+	defer f.Close()
+	if err := populateCache(f, cache); err != nil {
+		log.Fatal(err)
+	}
+	// The callback below only reads from cache.
+	pp := parallel.NewProcessor(os.Stdin, os.Stdout, func(p []byte) ([]byte, error) {
+		fields := strings.Split(string(p), "\t")
+		if len(fields) < 2 {
+			return nil, nil
+		}
+		s, t := strings.TrimSpace(fields[0]), strings.TrimSpace(fields[1])
+		source, err := strconv.Atoi(s)
+		if err != nil {
+			log.Printf("skipping invalid source id: %s (%v)", s, err)
+			return nil, nil
+		}
+		target, err := strconv.Atoi(t)
+		if err != nil {
+			log.Printf("skipping invalid target id: %s (%v)", t, err)
+			return nil, nil
+		}
+		v, ok := cache[source]
+		if !ok {
+			return nil, nil
+		}
+		w, ok := cache[target]
+		if !ok {
+			return nil, nil
+		}
+		line := fmt.Sprintf("%s\t%s\n", v, w)
+		return []byte(line), nil
+	})
+	if err := pp.Run(); err != nil {
+		log.Fatal(err)
+	}
+}
diff --git a/extra/mag/memkey.go b/extra/mag/memkey.go
new file mode 100644
index 0000000..b2e4b42
--- /dev/null
+++ b/extra/mag/memkey.go
@@ -0,0 +1,64 @@
+// memkey reads a two column TSV (integer key, string value) from stdin into
+// an in-memory map, then prompts for enter before exiting.
+package main
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"strconv"
+	"strings"
+)
+
+// insertLine parses a single "id<TAB>value" line and stores the value in
+// cache under the integer id. Lines with fewer than two columns are ignored,
+// lines with a non-integer id are logged and skipped. It reports whether an
+// entry was stored.
+func insertLine(cache map[int]string, line string) bool {
+	fields := strings.Split(line, "\t")
+	if len(fields) < 2 {
+		return false
+	}
+	a, b := strings.TrimSpace(fields[0]), strings.TrimSpace(fields[1])
+	id, err := strconv.Atoi(a)
+	if err != nil {
+		// Printf, not Println: the message contains a formatting verb.
+		log.Printf("skipping invalid id: %s", strings.TrimSpace(line))
+		return false
+	}
+	cache[id] = b
+	return true
+}
+
+func main() {
+	var (
+		cache = make(map[int]string)
+		br    = bufio.NewReader(os.Stdin)
+		i     int
+	)
+	for {
+		line, err := br.ReadString('\n')
+		if err != nil && err != io.EOF {
+			log.Fatal(err)
+		}
+		// ReadString returns the data read so far together with io.EOF when
+		// the input does not end in a newline, so the line is handled before
+		// the EOF check; otherwise the last record would be dropped.
+		if insertLine(cache, line) {
+			i++
+			if i%1000000 == 0 {
+				log.Printf("%d", i)
+			}
+		}
+		if err == io.EOF {
+			break
+		}
+	}
+	// NOTE(review): Scanln reads from stdin, which has just been consumed up to
+	// EOF above; when the data is piped in, this will likely return right away
+	// instead of waiting -- confirm the intended way to keep the process alive.
+	log.Println("press enter to quit")
+	fmt.Scanln()
+}
-- 
cgit v1.2.3