aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cmd/skate-verify/main.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/cmd/skate-verify/main.go')
-rw-r--r--skate/cmd/skate-verify/main.go158
1 files changed, 0 insertions, 158 deletions
diff --git a/skate/cmd/skate-verify/main.go b/skate/cmd/skate-verify/main.go
deleted file mode 100644
index 1288404..0000000
--- a/skate/cmd/skate-verify/main.go
+++ /dev/null
@@ -1,158 +0,0 @@
-// Generate pairs and run verification on a larger number of records. Mimic
-// fuzzycat.verify, but make it faster (e.g. fuzzycat took about 50h for the
-// complete set).
-//
-// Currently: about 2h for 40M clusters (in "ref" mode).
-//
-// XXX: Clean up inconsistent "modes".
-package main
-
-import (
- "bufio"
- "flag"
- "io"
- "log"
- "os"
- "runtime"
- "runtime/pprof"
- "strings"
-
- "git.archive.org/martin/cgraph/skate"
- "git.archive.org/martin/cgraph/skate/parallel"
-)
-
-// Command line flags. Which flags are required depends on the mode (-m):
-// exact needs -R, -F and -r; zip needs -R and -F; wiki needs -R and -W; ref
-// and bref read stdin and honor -w and -b.
-var (
-	numWorkers   = flag.Int("w", runtime.NumCPU(), "number of workers")
-	batchSize    = flag.Int("b", 10000, "batch size")
-	mode         = flag.String("m", "ref", "mode: exact, ref, bref, zip, wiki")
-	exactReason  = flag.String("r", "", "doi, pmid, pmcid, arxiv")
-	provenance   = flag.String("p", "join", "provenance info")
-	wikiFile     = flag.String("W", "", "wiki citation file")
-	releasesFile = flag.String("R", "", "releases, tsv, sorted by key (exact, zip and wiki modes)")
-	refsFile     = flag.String("F", "", "refs, tsv, sorted by key (exact and zip modes)")
-	cpuProfile   = flag.String("cpuprofile", "", "write cpu profile to file")
-	memProfile   = flag.String("memprofile", "", "write heap profile to file (go tool pprof -png --alloc_objects program mem.pprof > mem.png)")
-
-	// matchResults maps the value of -r to the fixed match result attached
-	// to records in exact mode; "unknown" is the fallback for any other
-	// value.
-	//
-	// XXX: This should be cleaned up soon.
-	matchResults = map[string]skate.MatchResult{
-		"doi":     skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonDOI},
-		"pmid":    skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonPMID},
-		"pmcid":   skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonPMCID},
-		"arxiv":   skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonArxiv},
-		"unknown": skate.MatchResult{Status: skate.StatusUnknown, Reason: skate.ReasonUnknown},
-	}
-)
-
-// main parses the command line flags, optionally starts CPU profiling, runs
-// the operation selected by -m and optionally writes a heap profile.
-//
-// Modes "exact", "zip" and "wiki" open two input files and write through a
-// buffered writer on stdout; modes "ref" and "bref" hand stdin and stdout to
-// a parallel.Processor. Any error terminates the program via log.Fatal.
-func main() {
-	flag.Parse()
-	if *cpuProfile != "" {
-		file, err := os.Create(*cpuProfile)
-		if err != nil {
-			log.Fatal(err)
-		}
-		// Deferred calls run last-in-first-out: the profile is stopped
-		// before the file is closed.
-		defer file.Close()
-		if err := pprof.StartCPUProfile(file); err != nil {
-			log.Fatal(err)
-		}
-		defer pprof.StopCPUProfile()
-	}
-	var (
-		f, g io.ReadCloser
-		err  error
-		bw   = bufio.NewWriter(os.Stdout)
-	)
-	switch *mode {
-	case "exact":
-		// Fixed zip mode for DOI.
-		if *refsFile == "" || *releasesFile == "" {
-			log.Fatal("mode requires -R and -F to be set")
-		}
-		if *exactReason == "" {
-			var keys []string
-			for k := range matchResults {
-				keys = append(keys, k)
-			}
-			log.Fatalf("need a reason for the record, one of: %s", strings.Join(keys, ", "))
-		}
-		if f, g, err = readersFromFilenames(*releasesFile, *refsFile); err != nil {
-			log.Fatal(err)
-		}
-		defer f.Close()
-		defer g.Close()
-		// A reason that is not a key of matchResults is not an error; it
-		// falls back to the "unknown" match result.
-		mr, ok := matchResults[*exactReason]
-		if !ok {
-			mr = matchResults["unknown"]
-		}
-		if err := skate.ZippyFixed(f, g, mr, *provenance, bw); err != nil {
-			log.Fatal(err)
-		}
-	case "zip":
-		// Take two "sorted key files" (one refs, one releases) and run
-		// verification across groups, generate biblioref file.
-		if *refsFile == "" || *releasesFile == "" {
-			log.Fatal("zip mode requires -F and -R to be set")
-		}
-		if f, g, err = readersFromFilenames(*releasesFile, *refsFile); err != nil {
-			log.Fatal(err)
-		}
-		defer f.Close()
-		defer g.Close()
-		if err := skate.ZippyVerifyRefs(f, g, bw); err != nil {
-			log.Fatal(err)
-		}
-	case "ref":
-		// https://git.io/JtACz
-		pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterVerify)
-		pp.NumWorkers = *numWorkers
-		pp.BatchSize = *batchSize
-		if err := pp.Run(); err != nil {
-			log.Fatal(err)
-		}
-	case "bref":
-		// generate biblioref
-		pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterToBiblioRef)
-		pp.NumWorkers = *numWorkers
-		pp.BatchSize = *batchSize
-		if err := pp.Run(); err != nil {
-			log.Fatal(err)
-		}
-	case "wiki":
-		// Fixed zip mode for DOI from wikipedia.
-		if *wikiFile == "" || *releasesFile == "" {
-			log.Fatal("mode requires -W and -R to be set")
-		}
-		if f, g, err = readersFromFilenames(*releasesFile, *wikiFile); err != nil {
-			log.Fatal(err)
-		}
-		defer f.Close()
-		defer g.Close()
-		mr := skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonDOI}
-		if err = skate.ZippyFixedWiki(f, g, mr, "wiki", bw); err != nil {
-			log.Fatal(err)
-		}
-	default:
-		log.Fatal("not implemented, only: exact, zip, ref, bref, wiki")
-	}
-	// Flush here instead of in a defer: log.Fatal exits without running
-	// deferred calls, so a failure while writing the heap profile below
-	// would otherwise drop output that was already produced, and a deferred
-	// Flush could not report a write error.
-	if err := bw.Flush(); err != nil {
-		log.Fatal(err)
-	}
-	if *memProfile != "" {
-		f, err := os.Create(*memProfile)
-		if err != nil {
-			log.Fatal("could not create memory profile: ", err)
-		}
-		defer f.Close()
-		runtime.GC()
-		if err := pprof.WriteHeapProfile(f); err != nil {
-			log.Fatal(err)
-		}
-	}
-}
-
-// readersFromFilenames opens f0 and f1 for reading and lets the caller check
-// for a single error only. On success the caller owns both readers and must
-// close them. On failure both returned readers are nil; if only the second
-// open fails, the already opened first file is closed before returning.
-func readersFromFilenames(f0, f1 string) (io.ReadCloser, io.ReadCloser, error) {
-	f, err := os.Open(f0)
-	if err != nil {
-		return nil, nil, err
-	}
-	g, err := os.Open(f1)
-	if err != nil {
-		// Do not leak the first descriptor; the open error is the one worth
-		// reporting, so the close error is intentionally dropped.
-		_ = f.Close()
-		return nil, nil, err
-	}
-	return f, g, nil
-}