diff options
Diffstat (limited to 'skate/cmd/skate-verify/main.go')
-rw-r--r-- | skate/cmd/skate-verify/main.go | 158 |
1 files changed, 0 insertions, 158 deletions
diff --git a/skate/cmd/skate-verify/main.go b/skate/cmd/skate-verify/main.go deleted file mode 100644 index 1288404..0000000 --- a/skate/cmd/skate-verify/main.go +++ /dev/null @@ -1,158 +0,0 @@ -// Generate pairs and run verification on larger number of records. Mimick -// fuzzycat.verify, but make it faster (e.g. fuzzycat took about 50h for the -// complete set). -// -// Currently: about 2h for 40M clusters (in "ref" mode). -// -// XXX: Cleanup inconsistent "modes". -package main - -import ( - "bufio" - "flag" - "io" - "log" - "os" - "runtime" - "runtime/pprof" - "strings" - - "git.archive.org/martin/cgraph/skate" - "git.archive.org/martin/cgraph/skate/parallel" -) - -var ( - numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") - batchSize = flag.Int("b", 10000, "batch size") - mode = flag.String("m", "ref", "mode: exact, ref, bref, zip, bzip, wiki") - exactReason = flag.String("r", "", "doi, pmid, pmcid, arxiv") - provenance = flag.String("p", "join", "provenance info") - wikiFile = flag.String("W", "", "wiki citation file") - releasesFile = flag.String("R", "", "releases, tsv, sorted by key (zip mode only)") - refsFile = flag.String("F", "", "refs, tsv, sorted by key (zip mode only)") - cpuProfile = flag.String("cpuprofile", "", "write cpu profile to file") - memProfile = flag.String("memprofile", "", "write heap profile to file (go tool pprof -png --alloc_objects program mem.pprof > mem.png)") - - // XXX: This should be cleanup up soon. - matchResults = map[string]skate.MatchResult{ - "doi": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonDOI}, - "pmid": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonPMID}, - "pmcid": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonPMCID}, - "arxiv": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonArxiv}, - "unknown": skate.MatchResult{Status: skate.StatusUnknown, Reason: skate.ReasonUnknown}, - } -) - -func main() { - flag.Parse() - if *cpuProfile != "" { - file, err := os.Create(*cpuProfile) - if err != nil { - log.Fatal(err) - } - pprof.StartCPUProfile(file) - defer pprof.StopCPUProfile() - } - var ( - f, g io.ReadCloser - err error - bw = bufio.NewWriter(os.Stdout) - ) - defer bw.Flush() - switch *mode { - case "exact": - // Fixed zip mode for DOI. - if *refsFile == "" || *releasesFile == "" { - log.Fatal("mode requires -R and -F to be set") - } - if *exactReason == "" { - var keys []string - for k := range matchResults { - keys = append(keys, k) - } - log.Fatalf("need a reason for the record, one of: %s", strings.Join(keys, ", ")) - } - if f, g, err = readersFromFilenames(*releasesFile, *refsFile); err != nil { - log.Fatal(err) - } - defer f.Close() - defer g.Close() - mr, ok := matchResults[*exactReason] - if !ok { - mr = matchResults["unknown"] - } - if err := skate.ZippyFixed(f, g, mr, *provenance, bw); err != nil { - log.Fatal(err) - } - case "zip": - // Take two "sorted key files" (one refs, one releases) and run - // verification across groups, generate biblioref file. - if *refsFile == "" || *releasesFile == "" { - log.Fatal("zip mode requires -F and -R to be set") - } - if f, g, err = readersFromFilenames(*releasesFile, *refsFile); err != nil { - log.Fatal(err) - } - defer f.Close() - defer g.Close() - if err := skate.ZippyVerifyRefs(f, g, bw); err != nil { - log.Fatal(err) - } - case "ref": - // https://git.io/JtACz - pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterVerify) - pp.NumWorkers = *numWorkers - pp.BatchSize = *batchSize - if err := pp.Run(); err != nil { - log.Fatal(err) - } - case "bref": - // generate biblioref - pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterToBiblioRef) - pp.NumWorkers = *numWorkers - pp.BatchSize = *batchSize - if err := pp.Run(); err != nil { - log.Fatal(err) - } - case "wiki": - // Fixed zip mode for DOI from wikipedia. - if *wikiFile == "" || *releasesFile == "" { - log.Fatal("mode requires -W and -F to be set") - } - if f, g, err = readersFromFilenames(*releasesFile, *wikiFile); err != nil { - log.Fatal(err) - } - defer f.Close() - defer g.Close() - mr := skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonDOI} - if err = skate.ZippyFixedWiki(f, g, mr, "wiki", bw); err != nil { - log.Fatal(err) - } - default: - log.Fatal("not implemented, only: exact, zip, ref, bref, wiki") - } - if *memProfile != "" { - f, err := os.Create(*memProfile) - if err != nil { - log.Fatal("could not create memory profile: ", err) - } - defer f.Close() - runtime.GC() - if err := pprof.WriteHeapProfile(f); err != nil { - log.Fatal(err) - } - } -} - -// readersFromFilenames lets the called check for a single error only. -func readersFromFilenames(f0, f1 string) (io.ReadCloser, io.ReadCloser, error) { - f, err := os.Open(f0) - if err != nil { - return nil, nil, err - } - g, err := os.Open(f1) - if err != nil { - return nil, nil, err - } - return f, g, nil -} |