aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cmd/skate-reduce
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-05-06 19:47:35 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-05-06 19:47:35 +0200
commitb46c9d351aa0a2a5c3618a1420259d4605a9654e (patch)
tree376f55f59baf657397210b2a2a3724b9ba61f6bb /skate/cmd/skate-reduce
parentfb627564751d170c55b45785da569268b6290be9 (diff)
downloadrefcat-b46c9d351aa0a2a5c3618a1420259d4605a9654e.tar.gz
refcat-b46c9d351aa0a2a5c3618a1420259d4605a9654e.zip
move skate-verify to skate-reduce
Diffstat (limited to 'skate/cmd/skate-reduce')
-rw-r--r--skate/cmd/skate-reduce/main.go144
1 files changed, 144 insertions, 0 deletions
diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go
new file mode 100644
index 0000000..df72ef4
--- /dev/null
+++ b/skate/cmd/skate-reduce/main.go
@@ -0,0 +1,144 @@
+// skate-reduce takes prepared inputs (e.g. from skate-map or skate-cluster)
+// and applies various verification and conversion functions. The output will
+// often be the biblioref schema.
+//
+// Support various modes.
+//
+// * exact: takes (key, doc) TSV files (one for releases, one for refs) and
+// will emit biblioref docs relating one element from releases with all
+// elements from ref; this is for "doi", "pmid" and other id matches, where no
+// further checks are necessary. The match reason, e.g. "doi" needs to be
+// supplied.
+//
+// $ skate-reduce -m exact -r doi -F a.tsv -L b.tsv
+//
+// * verify: takes (key, doc) TSV files (one for release, one for refs), runs
+// verification within a group and will emit biblioref.
+//
+// $ skate-reduce -m verify -F a.tsv -L b.tsv
+//
+// * ref: takes a single file with clusters containing releases and refs and
+// will emit verification results.
+//
+// $ skate-reduce -m ref < a.ndj
+//
+// * bref: same as ref, but generate a biblioref file as output
+//
+// $ skate-reduce -m bref < a.ndj
+//
+// * wiki: zippy mode for releases and wikipedia inputs.
+//
+// $ skate-reduce -m wiki -L a.ndj -W b.ndj
+//
+package main
+
+import (
+ "bufio"
+ "flag"
+ "log"
+ "os"
+ "runtime"
+ "runtime/pprof"
+
+ "git.archive.org/martin/cgraph/skate"
+ "git.archive.org/martin/cgraph/skate/parallel"
+ "git.archive.org/martin/cgraph/skate/xio"
+)
+
+var (
+ numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
+ batchSize = flag.Int("b", 10000, "batch size")
+ // Each mode may work on one or two files, and may need extra args.
+ mode = flag.String("m", "ref", "mode, e.g. exact, verify, ref, bref, wiki")
+
+ cpuProfile = flag.String("cpuprofile", "", "write cpu profile to file")
+ memProfile = flag.String("memprofile", "", "write heap profile to file (go tool pprof -png --alloc_objects program mem.pprof > mem.png)")
+
+ // Possible inputs, we could switch to a subcommand cli parser.
+ refs = flag.String("F", "", "path to refs input")
+ releases = flag.String("L", "", "path to release input")
+ wiki = flag.String("W", "", "path to wiki input")
+
+ // Extra args.
+ reason = flag.String("r", "", "reason for match: doi, pmid, pmcid, arxiv, unknown")
+ reasonMap = map[string]skate.MatchResult{
+ "doi": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonDOI},
+ "pmid": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonPMID},
+ "pmcid": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonPMCID},
+ "arxiv": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonArxiv},
+ "unknown": skate.MatchResult{Status: skate.StatusUnknown, Reason: skate.ReasonUnknown},
+ }
+)
+
+func main() {
+ flag.Parse()
+ if *cpuProfile != "" {
+ file, err := os.Create(*cpuProfile)
+ if err != nil {
+ log.Fatal(err)
+ }
+ pprof.StartCPUProfile(file)
+ defer pprof.StopCPUProfile()
+ }
+
+ bw := bufio.NewWriter(os.Stdout)
+ defer bw.Flush()
+
+ switch *mode {
+ case "exact":
+ l, f, err := xio.OpenTwo(*releases, *refs)
+ if err != nil {
+ log.Fatal(err)
+ }
+ r, ok := reasonMap[*reason]
+ if !ok {
+ log.Fatalf("unknown reason: %v", *reason)
+ }
+ if err := skate.ZippyExact(l, f, r, bw); err != nil {
+ log.Fatal(err)
+ }
+ case "verify":
+ l, f, err := xio.OpenTwo(*releases, *refs)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if err := skate.ZippyVerifyRefs(l, f, bw); err != nil {
+ log.Fatal(err)
+ }
+ case "ref":
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterVerify)
+ pp.NumWorkers = *numWorkers
+ pp.BatchSize = *batchSize
+ if err := pp.Run(); err != nil {
+ log.Fatal(err)
+ }
+ case "bref":
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterToBiblioRef)
+ pp.NumWorkers = *numWorkers
+ pp.BatchSize = *batchSize
+ if err := pp.Run(); err != nil {
+ log.Fatal(err)
+ }
+ case "wiki":
+ l, w, err := xio.OpenTwo(*releases, *wiki)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if err := skate.ZippyExactWiki(l, w, reasonMap["doi"], bw); err != nil {
+ log.Fatal(err)
+ }
+ default:
+ log.Fatalf("invalid mode")
+ }
+ if *memProfile != "" {
+ f, err := os.Create(*memProfile)
+ if err != nil {
+ log.Fatal("could not create memory profile: ", err)
+ }
+ defer f.Close()
+ runtime.GC()
+ if err := pprof.WriteHeapProfile(f); err != nil {
+ log.Fatal(err)
+ }
+ }
+}