aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- skate/cmd/skate-reduce/main.go 26
-rw-r--r-- skate/map.go 7
-rw-r--r-- skate/zipkey/zipkey.go 2
3 files changed, 18 insertions, 17 deletions
diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go
index df72ef4..d0cc0e3 100644
--- a/skate/cmd/skate-reduce/main.go
+++ b/skate/cmd/skate-reduce/main.go
@@ -1,24 +1,25 @@
// skate-reduce takes prepared inputs (e.g. from skate-map or skate-cluster)
// and applies various verification and conversion functions. The output will
-// often be the biblioref schema.
+// often be a biblioref schema stream.
//
-// Support various modes.
+// Supports various modes, e.g. exact, verify, ref, bref, wiki. Each mode may
+// work on one or two files, and may need extra args.
//
-// * exact: takes (key, doc) TSV files (one for releases, one for refs) and
-// will emit biblioref docs relating one element from releases with all
-// elements from ref; this is for "doi", "pmid" and other id matches, where no
-// further checks are necessary. The match reason, e.g. "doi" needs to be
-// supplied.
+// * exact: takes two (key, doc) TSV files (one for releases, one for refs) and
+// will emit biblioref docs relating *one* element from releases with *all*
+// elements from ref; this is for "doi", "pmid" and other id matches, where no
+// further checks are necessary. The match reason, e.g. "doi" needs to be
+// supplied.
//
// $ skate-reduce -m exact -r doi -F a.tsv -L b.tsv
//
-// * verify: takes (key, doc) TSV files (one for release, one for refs), runs
-// verification within a group and will emit biblioref.
+// * verify: takes two (key, doc) TSV files (one for release, one for refs),
+// runs verification within a group and will emit biblioref.
//
// $ skate-reduce -m verify -F a.tsv -L b.tsv
//
// * ref: takes a single file with clusters containing releases and refs and
-// will emit verification results.
+// will emit verification results.
//
// $ skate-reduce -m ref < a.ndj
//
@@ -48,13 +49,12 @@ import (
var (
numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
batchSize = flag.Int("b", 10000, "batch size")
- // Each mode may work on one or two files, and may need extra args.
- mode = flag.String("m", "ref", "mode, e.g. exact, verify, ref, bref, wiki")
+ mode = flag.String("m", "ref", "mode, e.g. exact, verify, ref, bref, wiki")
cpuProfile = flag.String("cpuprofile", "", "write cpu profile to file")
memProfile = flag.String("memprofile", "", "write heap profile to file (go tool pprof -png --alloc_objects program mem.pprof > mem.png)")
- // Possible inputs, we could switch to a subcommand cli parser.
+ // Possible inputs -- we could switch to a subcommand cli parser?
refs = flag.String("F", "", "path to refs input")
releases = flag.String("L", "", "path to release input")
wiki = flag.String("W", "", "path to wiki input")
diff --git a/skate/map.go b/skate/map.go
index 356839c..cf2933e 100644
--- a/skate/map.go
+++ b/skate/map.go
@@ -181,9 +181,10 @@ func MapperTitleSandcrawler(p []byte) (fields [][]byte, err error) {
return fields, nil
}
-// MapperURLFromRef extracts the work, release ident, url and doc. Previously:
-// parallel -j 16 --block 100M --pipe "jq -rc '[.work_ident, .release_ident,
-// .biblio.url?] | @tsv'" ...
+// MapperURLFromRef extracts the (work ident, release ident, url, doc).
+// Previously: parallel -j 16 --block 100M --pipe "jq -rc '[.work_ident,
+// .release_ident, .biblio.url?] | @tsv'" ...
+// This implementation seems slightly faster than jq and parallel.
func MapperURLFromRef(p []byte) (fields [][]byte, err error) {
var ref Ref
if err = json.Unmarshal(p, &ref); err != nil {
diff --git a/skate/zipkey/zipkey.go b/skate/zipkey/zipkey.go
index eb3dc55..aa49983 100644
--- a/skate/zipkey/zipkey.go
+++ b/skate/zipkey/zipkey.go
@@ -1,5 +1,5 @@
// Package zipkey implements ZipRun, a type that allows to attach a callback to
-// a group of elements taken from two streams.
+// a group of elements with a shared key taken from two streams.
package zipkey
import (