reduce: docs and simplifications

author: Martin Czygan <martin.czygan@gmail.com> 2021-05-10 23:20:27 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-05-10 23:20:27 +0200
commit: 5ae497d89d2eb0fae6a6016a21390a8d38f2b83d (patch)
tree: 35bd76c557f610e98338c1a31d3dfc2bd1fb14a6 /skate/cmd
parent: 521f4fdfa3db52043686fdce232f402b468362ff (diff)
download: refcat-5ae497d89d2eb0fae6a6016a21390a8d38f2b83d.tar.gz
refcat-5ae497d89d2eb0fae6a6016a21390a8d38f2b83d.zip
2 files changed, 34 insertions, 53 deletions
diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index 9bf2d14..4b30927 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -2,9 +2,9 @@
 // extract a key from a json document. For simple cases, you can use `jq` and
 // other tools. Some key derivations require a bit more, hence a dedicated program.
 //
-// An example with mostly unix tools. We want to extract the DOI from newline
-// delimited JSON and sort by it; we also want to do this fast, hence parallel,
-// LC_ALL, etc.
+// An example with mostly unix tools. We want to extract (DOI, doc) tuples from
+// newline delimited JSON and sort them by DOI; we also want to do this fast, hence
+// parallel, LC_ALL, etc.
 //
 // $ zstdcat -T0 file.zst |                                  (1)
 //     LC_ALL=C tr -d '\t' |                                 (2) *
diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go
index d0cc0e3..959dd87 100644
--- a/skate/cmd/skate-reduce/main.go
+++ b/skate/cmd/skate-reduce/main.go
@@ -1,35 +1,39 @@
 // skate-reduce takes prepared inputs (e.g. from skate-map or skate-cluster)
 // and applies various verification and conversion functions. The output will
-// often be a biblioref schema stream.
+// often be a stream of biblioref schema docs.
 //
-// Support various modes, e.g. exact, verify, ref, bref, wiki.  Each mode may
+// Supports various "modes", e.g. exact, verify, ref, bref, wiki. Each mode may
 // work on one or two files, and may need extra args.
 //
-// * exact: takes two (key, doc) TSV files (one for releases, one for refs) and
-//          will emit biblioref docs relating *one* element from releases with *all*
-//          elements from ref; this is for "doi", "pmid" and other id matches, where no
-//          further checks are necessary. The match reason, e.g. "doi" needs to be
-//          supplied.
-//
-//     $ skate-reduce -m exact -r doi -F a.tsv -L b.tsv
-//
-// * verify: takes two (key, doc) TSV files (one for release, one for refs),
-//           runs verification within a group and will emit biblioref.
-//
-//     $ skate-reduce -m verify -F a.tsv -L b.tsv
-//
-// * ref: takes a single file with clusters containing releases and refs and
-//        will emit verification results.
-//
-//     $ skate-reduce -m ref < a.ndj
-//
-// * bref: same as ref, but generate a biblioref file as output
-//
-//     $ skate-reduce -m bref < a.ndj
-//
-// * wiki: zippy mode for releases and wikipedia inputs.
-//
-//     $ skate-reduce -m wiki -L a.ndj -W b.ndj
+// * exact  | takes two (key, doc) TSV files (one for releases, one for refs) and
+//          | will emit biblioref docs relating *one* element from releases with *all*
+//          | elements from ref; this is for "doi", "pmid" and other id matches, where no
+//          | further checks are necessary. The match reason, e.g. "doi" needs to be
+//          | supplied.
+//          |
+//          | $ skate-reduce -m exact -r doi -F a.tsv -L b.tsv
+//          |
+//          |
+// * verify | takes two (key, doc) TSV files (one for releases, one for refs),
+//          | runs verification within a group and will emit biblioref docs.
+//          |
+//          | $ skate-reduce -m verify -F a.tsv -L b.tsv
+//          |
+//          |
+// * ref    | takes a single file with clusters containing releases and refs and
+//          | will emit verification results.
+//          |
+//          | $ skate-reduce -m ref < a.ndj
+//          |
+//          |
+// * bref   | same as ref, but generate a biblioref file as output
+//          |
+//          | $ skate-reduce -m bref < a.ndj
+//          |
+//          |
+// * wiki   | zippy mode for releases and wikipedia inputs.
+//          |
+//          | $ skate-reduce -m wiki -L a.ndj -W b.ndj
 //
 package main
 
@@ -39,7 +43,6 @@ import (
 	"log"
 	"os"
 	"runtime"
-	"runtime/pprof"
 
 	"git.archive.org/martin/cgraph/skate"
 	"git.archive.org/martin/cgraph/skate/parallel"
@@ -51,9 +54,6 @@ var (
 	batchSize  = flag.Int("b", 10000, "batch size")
 	mode       = flag.String("m", "ref", "mode, e.g. exact, verify, ref, bref, wiki")
 
-	cpuProfile = flag.String("cpuprofile", "", "write cpu profile to file")
-	memProfile = flag.String("memprofile", "", "write heap profile to file (go tool pprof -png --alloc_objects program mem.pprof > mem.png)")
-
 	// Possible inputs -- we could switch to a subcommand cli parser?
 	refs     = flag.String("F", "", "path to refs input")
 	releases = flag.String("L", "", "path to release input")
@@ -72,14 +72,6 @@ var (
 
 func main() {
 	flag.Parse()
-	if *cpuProfile != "" {
-		file, err := os.Create(*cpuProfile)
-		if err != nil {
-			log.Fatal(err)
-		}
-		pprof.StartCPUProfile(file)
-		defer pprof.StopCPUProfile()
-	}
 
 	bw := bufio.NewWriter(os.Stdout)
 	defer bw.Flush()
@@ -130,15 +122,4 @@ func main() {
 	default:
 		log.Fatalf("invalid mode")
 	}
-	if *memProfile != "" {
-		f, err := os.Create(*memProfile)
-		if err != nil {
-			log.Fatal("could not create memory profile: ", err)
-		}
-		defer f.Close()
-		runtime.GC()
-		if err := pprof.WriteHeapProfile(f); err != nil {
-			log.Fatal(err)
-		}
-	}
 }
author	Martin Czygan <martin.czygan@gmail.com>	2021-05-10 23:20:27 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-05-10 23:20:27 +0200
commit	5ae497d89d2eb0fae6a6016a21390a8d38f2b83d (patch)
tree	35bd76c557f610e98338c1a31d3dfc2bd1fb14a6 /skate/cmd
parent	521f4fdfa3db52043686fdce232f402b468362ff (diff)
download	refcat-5ae497d89d2eb0fae6a6016a21390a8d38f2b83d.tar.gz refcat-5ae497d89d2eb0fae6a6016a21390a8d38f2b83d.zip