From 5ae497d89d2eb0fae6a6016a21390a8d38f2b83d Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Mon, 10 May 2021 23:20:27 +0200
Subject: reduce: docs and simplifications

---
 skate/cmd/skate-map/main.go    |  6 ++--
 skate/cmd/skate-reduce/main.go | 81 ++++++++++++++++--------------------------
 2 files changed, 34 insertions(+), 53 deletions(-)

(limited to 'skate/cmd')

diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index 9bf2d14..4b30927 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -2,9 +2,9 @@
 // extract a key from a json document. For simple cases, you can use `jq` and
 // other tools. Some key derivations require a bit more, hence a dedicated program.
 //
-// An example with mostly unix tools. We want to extract the DOI from newline
-// delimited JSON and sort by it; we also want to do this fast, hence parallel,
-// LC_ALL, etc.
+// An example with mostly unix tools. We want to extract (DOI, doc) tuples from
+// newline delimited JSON and sort by it; we also want to do this fast, hence
+// parallel, LC_ALL, etc.
 //
 //     $ zstdcat -T0 file.zst |          (1)
 //          LC_ALL=C tr -d '\t' |        (2) *
diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go
index d0cc0e3..959dd87 100644
--- a/skate/cmd/skate-reduce/main.go
+++ b/skate/cmd/skate-reduce/main.go
@@ -1,35 +1,39 @@
 // skate-reduce takes prepared inputs (e.g. from skate-map or skate-cluster)
 // and applies various verification and conversion functions. The output will
-// often be a biblioref schema stream.
+// often be a stream of biblioref schema docs.
 //
-// Support various modes, e.g. exact, verify, ref, bref, wiki. Each mode may
+// Support various "modes", e.g. exact, verify, ref, bref, wiki. Each mode may
 // work on one or two files, and may need extra args.
 //
-// * exact: takes two (key, doc) TSV files (one for releases, one for refs) and
-// will emit biblioref docs relating *one* element from releases with *all*
-// elements from ref; this is for "doi", "pmid" and other id matches, where no
-// further checks are necessary. The match reason, e.g. "doi" needs to be
-// supplied.
-//
-// $ skate-reduce -m exact -r doi -F a.tsv -L b.tsv
-//
-// * verify: takes two (key, doc) TSV files (one for release, one for refs),
-// runs verification within a group and will emit biblioref.
-//
-// $ skate-reduce -m verify -F a.tsv -L b.tsv
-//
-// * ref: takes a single file with clusters containing releases and refs and
-// will emit verification results.
-//
-// $ skate-reduce -m ref < a.ndj
-//
-// * bref: same as ref, but generate a biblioref file as output
-//
-// $ skate-reduce -m bref < a.ndj
-//
-// * wiki: zippy mode for releases and wikipedia inputs.
-//
-// $ skate-reduce -m wiki -L a.ndj -W b.ndj
+// * exact   | takes two (key, doc) TSV files (one for releases, one for refs) and
+//           | will emit biblioref docs relating *one* element from releases with *all*
+//           | elements from ref; this is for "doi", "pmid" and other id matches, where no
+//           | further checks are necessary. The match reason, e.g. "doi" needs to be
+//           | supplied.
+//           |
+//           | $ skate-reduce -m exact -r doi -F a.tsv -L b.tsv
+//           |
+//           |
+// * verify  | takes two (key, doc) TSV files (one for release, one for refs),
+//           | runs verification within a group and will emit biblioref.
+//           |
+//           | $ skate-reduce -m verify -F a.tsv -L b.tsv
+//           |
+//           |
+// * ref     | takes a single file with clusters containing releases and refs and
+//           | will emit verification results.
+//           |
+//           | $ skate-reduce -m ref < a.ndj
+//           |
+//           |
+// * bref    | same as ref, but generate a biblioref file as output
+//           |
+//           | $ skate-reduce -m bref < a.ndj
+//           |
+//           |
+// * wiki    | zippy mode for releases and wikipedia inputs.
+//           |
+//           | $ skate-reduce -m wiki -L a.ndj -W b.ndj
 //
 package main
 
@@ -39,7 +43,6 @@ import (
 	"log"
 	"os"
 	"runtime"
-	"runtime/pprof"
 
 	"git.archive.org/martin/cgraph/skate"
 	"git.archive.org/martin/cgraph/skate/parallel"
@@ -51,9 +54,6 @@ var (
 	batchSize = flag.Int("b", 10000, "batch size")
 	mode      = flag.String("m", "ref", "mode, e.g. exact, verify, ref, bref, wiki")
 
-	cpuProfile = flag.String("cpuprofile", "", "write cpu profile to file")
-	memProfile = flag.String("memprofile", "", "write heap profile to file (go tool pprof -png --alloc_objects program mem.pprof > mem.png)")
-
 	// Possible inputs -- we could switch to a subcommand cli parser?
 	refs     = flag.String("F", "", "path to refs input")
 	releases = flag.String("L", "", "path to release input")
@@ -72,14 +72,6 @@ var (
 
 func main() {
 	flag.Parse()
 
-	if *cpuProfile != "" {
-		file, err := os.Create(*cpuProfile)
-		if err != nil {
-			log.Fatal(err)
-		}
-		pprof.StartCPUProfile(file)
-		defer pprof.StopCPUProfile()
-	}
 	bw := bufio.NewWriter(os.Stdout)
 	defer bw.Flush()
@@ -130,15 +122,4 @@ func main()
 	default:
 		log.Fatalf("invalid mode")
 	}
-	if *memProfile != "" {
-		f, err := os.Create(*memProfile)
-		if err != nil {
-			log.Fatal("could not create memory profile: ", err)
-		}
-		defer f.Close()
-		runtime.GC()
-		if err := pprof.WriteHeapProfile(f); err != nil {
-			log.Fatal(err)
-		}
-	}
 }
--
cgit v1.2.3
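Aside: the "exact" mode documented in the patch above boils down to a merge join over two key-sorted (key, doc) TSV streams, pairing one release doc with all ref docs that share a key. The sketch below illustrates that pattern in plain Go; it is not the skate implementation, and the names (groupReader, refs.tsv, releases.tsv) and the printed output format are assumptions made only for this example.

// Minimal merge-join sketch over two key-sorted TSV streams of "key\tdoc"
// lines. Illustration of the pattern only, not the actual skate code.
package main

import (
	"bufio"
	"fmt"
	"log"
	"os"
	"strings"
)

// groupReader yields consecutive (key, docs) groups from a key-sorted stream.
type groupReader struct {
	sc   *bufio.Scanner
	key  string
	doc  string
	done bool
}

func newGroupReader(f *os.File) *groupReader {
	g := &groupReader{sc: bufio.NewScanner(f)}
	g.advance()
	return g
}

// advance buffers the next line, split into key and doc at the first tab.
func (g *groupReader) advance() {
	if !g.sc.Scan() {
		g.done = true
		return
	}
	parts := strings.SplitN(g.sc.Text(), "\t", 2)
	g.key, g.doc = parts[0], ""
	if len(parts) == 2 {
		g.doc = parts[1]
	}
}

// next returns the next key and all docs sharing it; ok is false at EOF.
func (g *groupReader) next() (key string, docs []string, ok bool) {
	if g.done {
		return "", nil, false
	}
	key = g.key
	for !g.done && g.key == key {
		docs = append(docs, g.doc)
		g.advance()
	}
	return key, docs, true
}

func main() {
	refs, err := os.Open("refs.tsv") // hypothetical key-sorted (key, ref doc) file
	if err != nil {
		log.Fatal(err)
	}
	defer refs.Close()
	releases, err := os.Open("releases.tsv") // hypothetical key-sorted (key, release doc) file
	if err != nil {
		log.Fatal(err)
	}
	defer releases.Close()

	r, l := newGroupReader(refs), newGroupReader(releases)
	rk, rdocs, rok := r.next()
	lk, ldocs, lok := l.next()
	for rok && lok {
		switch {
		case lk < rk: // release key with no refs, skip
			lk, ldocs, lok = l.next()
		case lk > rk: // ref key with no release, skip
			rk, rdocs, rok = r.next()
		default: // same key: relate one release with all refs under this key
			for _, ref := range rdocs {
				fmt.Printf("%s\t%s\t%s\n", lk, ldocs[0], ref)
			}
			lk, ldocs, lok = l.next()
			rk, rdocs, rok = r.next()
		}
	}
}

Because both inputs arrive grouped by key, memory stays bounded by the largest single group, which is presumably why the upstream skate-map/sort steps key the data before it reaches skate-reduce.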