// skate-reduce takes prepared inputs (e.g. from skate-map or skate-cluster) // and applies various verification and conversion functions. The output will // often be the biblioref schema. // // Support various modes. // // * exact: takes (key, doc) TSV files (one for releases, one for refs) and // will emit biblioref docs relating one element from releases with all // elements from ref; this is for "doi", "pmid" and other id matches, where no // further checks are necessary. The match reason, e.g. "doi" needs to be // supplied. // // $ skate-reduce -m exact -r doi -F a.tsv -L b.tsv // // * verify: takes (key, doc) TSV files (one for release, one for refs), runs // verification within a group and will emit biblioref. // // $ skate-reduce -m verify -F a.tsv -L b.tsv // // * ref: takes a single file with clusters containing releases and refs and // will emit verification results. // // $ skate-reduce -m ref < a.ndj // // * bref: same as ref, but generate a biblioref file as output // // $ skate-reduce -m bref < a.ndj // // * wiki: zippy mode for releases and wikipedia inputs. // // $ skate-reduce -m wiki -L a.ndj -W b.ndj // package main import ( "bufio" "flag" "log" "os" "runtime" "runtime/pprof" "git.archive.org/martin/cgraph/skate" "git.archive.org/martin/cgraph/skate/parallel" "git.archive.org/martin/cgraph/skate/xio" ) var ( numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") batchSize = flag.Int("b", 10000, "batch size") // Each mode may work on one or two files, and may need extra args. mode = flag.String("m", "ref", "mode, e.g. exact, verify, ref, bref, wiki") cpuProfile = flag.String("cpuprofile", "", "write cpu profile to file") memProfile = flag.String("memprofile", "", "write heap profile to file (go tool pprof -png --alloc_objects program mem.pprof > mem.png)") // Possible inputs, we could switch to a subcommand cli parser. refs = flag.String("F", "", "path to refs input") releases = flag.String("L", "", "path to release input") wiki = flag.String("W", "", "path to wiki input") // Extra args. reason = flag.String("r", "", "reason for match: doi, pmid, pmcid, arxiv, unknown") reasonMap = map[string]skate.MatchResult{ "doi": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonDOI}, "pmid": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonPMID}, "pmcid": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonPMCID}, "arxiv": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonArxiv}, "unknown": skate.MatchResult{Status: skate.StatusUnknown, Reason: skate.ReasonUnknown}, } ) func main() { flag.Parse() if *cpuProfile != "" { file, err := os.Create(*cpuProfile) if err != nil { log.Fatal(err) } pprof.StartCPUProfile(file) defer pprof.StopCPUProfile() } bw := bufio.NewWriter(os.Stdout) defer bw.Flush() switch *mode { case "exact": l, f, err := xio.OpenTwo(*releases, *refs) if err != nil { log.Fatal(err) } r, ok := reasonMap[*reason] if !ok { log.Fatalf("unknown reason: %v", *reason) } if err := skate.ZippyExact(l, f, r, bw); err != nil { log.Fatal(err) } case "verify": l, f, err := xio.OpenTwo(*releases, *refs) if err != nil { log.Fatal(err) } if err := skate.ZippyVerifyRefs(l, f, bw); err != nil { log.Fatal(err) } case "ref": pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterVerify) pp.NumWorkers = *numWorkers pp.BatchSize = *batchSize if err := pp.Run(); err != nil { log.Fatal(err) } case "bref": pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterToBiblioRef) pp.NumWorkers = *numWorkers pp.BatchSize = *batchSize if err := pp.Run(); err != nil { log.Fatal(err) } case "wiki": l, w, err := xio.OpenTwo(*releases, *wiki) if err != nil { log.Fatal(err) } if err := skate.ZippyExactWiki(l, w, reasonMap["doi"], bw); err != nil { log.Fatal(err) } default: log.Fatalf("invalid mode") } if *memProfile != "" { f, err := os.Create(*memProfile) if err != nil { log.Fatal("could not create memory profile: ", err) } defer f.Close() runtime.GC() if err := pprof.WriteHeapProfile(f); err != nil { log.Fatal(err) } } }