aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cmd/skate-verify
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-03-21 01:17:38 +0100
committerMartin Czygan <martin.czygan@gmail.com>2021-03-21 01:17:38 +0100
commit09a7e8c9d013f13a1aa1ef4e9b7f397647b79967 (patch)
tree122b474e27afbc66cba1182e983ef5c8555ed12f /skate/cmd/skate-verify
parenta7e0cf191ebf8fb499e0ab9a3b6cae45727f1286 (diff)
downloadrefcat-09a7e8c9d013f13a1aa1ef4e9b7f397647b79967.tar.gz
refcat-09a7e8c9d013f13a1aa1ef4e9b7f397647b79967.zip
initial import of skate
Diffstat (limited to 'skate/cmd/skate-verify')
-rw-r--r--skate/cmd/skate-verify/main.go140
1 files changed, 140 insertions, 0 deletions
diff --git a/skate/cmd/skate-verify/main.go b/skate/cmd/skate-verify/main.go
new file mode 100644
index 0000000..e6fc417
--- /dev/null
+++ b/skate/cmd/skate-verify/main.go
@@ -0,0 +1,140 @@
+// Generate pairs and run verification on larger number of records. Mimick
+// fuzzycat.verify, but make it faster (e.g. fuzzycat took about 50h for the
+// complete set).
+//
+// Currently: about 2h for 40M clusters (in "ref" mode).
+//
+// XXX: Cleanup inconsistent "modes".
+package main
+
+import (
+ "bufio"
+ "flag"
+ "log"
+ "os"
+ "runtime"
+ "runtime/pprof"
+ "strings"
+
+ jsoniter "github.com/json-iterator/go"
+ "git.archive.org/martin/cgraph/skate"
+ "git.archive.org/martin/cgraph/skate/parallel"
+)
+
+var (
+ numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
+ batchSize = flag.Int("b", 10000, "batch size")
+ mode = flag.String("m", "ref", "mode: exact, ref, bref, zip, bzip")
+ exactReason = flag.String("r", "", "doi, pmid, pmcid, arxiv")
+ provenance = flag.String("p", "join", "provenance info")
+ releasesFile = flag.String("R", "", "releases, tsv, sorted by key (zip mode only)")
+ refsFile = flag.String("F", "", "refs, tsv, sorted by key (zip mode only)")
+ cpuProfile = flag.String("cpuprofile", "", "write cpu profile to file")
+ memProfile = flag.String("memprofile", "", "write heap profile to file (go tool pprof -png --alloc_objects program mem.pprof > mem.png)")
+
+ json = jsoniter.ConfigCompatibleWithStandardLibrary
+
+ // XXX: This should be cleanup up soon.
+ matchResults = map[string]skate.MatchResult{
+ "doi": skate.MatchResult{skate.StatusExact, skate.ReasonDOI},
+ "pmid": skate.MatchResult{skate.StatusExact, skate.ReasonPMID},
+ "pmcid": skate.MatchResult{skate.StatusExact, skate.ReasonPMCID},
+ "arxiv": skate.MatchResult{skate.StatusExact, skate.ReasonArxiv},
+ "unknown": skate.MatchResult{skate.StatusUnknown, skate.ReasonUnknown},
+ }
+)
+
+func main() {
+ flag.Parse()
+ if *cpuProfile != "" {
+ file, err := os.Create(*cpuProfile)
+ if err != nil {
+ log.Fatal(err)
+ }
+ pprof.StartCPUProfile(file)
+ defer pprof.StopCPUProfile()
+ }
+ switch *mode {
+ case "exact":
+ // Fixed zip mode for DOI.
+ if *refsFile == "" || *releasesFile == "" {
+ log.Fatal("mode requires -R and -F to be set")
+ }
+ if *exactReason == "" {
+ var keys []string
+ for k := range matchResults {
+ keys = append(keys, k)
+ }
+ log.Fatalf("need a reason for the record, one of: %s", strings.Join(keys, ", "))
+ }
+ f, err := os.Open(*releasesFile)
+ if err != nil {
+ log.Fatal(err)
+ }
+ defer f.Close()
+ g, err := os.Open(*refsFile)
+ if err != nil {
+ log.Fatal(err)
+ }
+ defer g.Close()
+ bw := bufio.NewWriter(os.Stdout)
+ defer bw.Flush()
+ mr, ok := matchResults[*exactReason]
+ if !ok {
+ mr = matchResults["unknown"]
+ }
+ if err := skate.ZipUnverified(f, g, mr, *provenance, bw); err != nil {
+ log.Fatal(err)
+ }
+ case "zip":
+ // Take two "sorted key files" (one refs, one releases) and run
+ // verification across groups, generate biblioref file.
+ if *refsFile == "" || *releasesFile == "" {
+ log.Fatal("zip mode requires -R and -F to be set")
+ }
+ f, err := os.Open(*releasesFile)
+ if err != nil {
+ log.Fatal(err)
+ }
+ defer f.Close()
+ g, err := os.Open(*refsFile)
+ if err != nil {
+ log.Fatal(err)
+ }
+ defer g.Close()
+ bw := bufio.NewWriter(os.Stdout)
+ defer bw.Flush()
+ if err := skate.ZipVerifyRefs(f, g, bw); err != nil {
+ log.Fatal(err)
+ }
+ case "ref":
+ // https://git.io/JtACz
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefCluster)
+ pp.NumWorkers = *numWorkers
+ pp.BatchSize = *batchSize
+ if err := pp.Run(); err != nil {
+ log.Fatal(err)
+ }
+ case "bref":
+ // generate biblioref
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterToBiblioRef)
+ pp.NumWorkers = *numWorkers
+ pp.BatchSize = *batchSize
+ if err := pp.Run(); err != nil {
+ log.Fatal(err)
+ }
+ default:
+ log.Fatal("not implemented, only: zip, ref, bref")
+ }
+ if *memProfile != "" {
+ f, err := os.Create(*memProfile)
+ if err != nil {
+ log.Fatal("could not create memory profile: ", err)
+ }
+ defer f.Close()
+ runtime.GC()
+ if err := pprof.WriteHeapProfile(f); err != nil {
+ log.Fatal(err)
+ }
+ }
+}