diff options
Diffstat (limited to 'skate')
-rw-r--r-- | skate/cmd/skate-reduce/main.go | 26 | ||||
-rw-r--r-- | skate/map.go | 7 | ||||
-rw-r--r-- | skate/zipkey/zipkey.go | 2 |
3 files changed, 18 insertions, 17 deletions
diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go index df72ef4..d0cc0e3 100644 --- a/skate/cmd/skate-reduce/main.go +++ b/skate/cmd/skate-reduce/main.go @@ -1,24 +1,25 @@ // skate-reduce takes prepared inputs (e.g. from skate-map or skate-cluster) // and applies various verification and conversion functions. The output will -// often be the biblioref schema. +// often be a biblioref schema stream. // -// Support various modes. +// Support various modes, e.g. exact, verify, ref, bref, wiki. Each mode may +// work on one or two files, and may need extra args. // -// * exact: takes (key, doc) TSV files (one for releases, one for refs) and -// will emit biblioref docs relating one element from releases with all -// elements from ref; this is for "doi", "pmid" and other id matches, where no -// further checks are necessary. The match reason, e.g. "doi" needs to be -// supplied. +// * exact: takes two (key, doc) TSV files (one for releases, one for refs) and +// will emit biblioref docs relating *one* element from releases with *all* +// elements from ref; this is for "doi", "pmid" and other id matches, where no +// further checks are necessary. The match reason, e.g. "doi" needs to be +// supplied. // // $ skate-reduce -m exact -r doi -F a.tsv -L b.tsv // -// * verify: takes (key, doc) TSV files (one for release, one for refs), runs -// verification within a group and will emit biblioref. +// * verify: takes two (key, doc) TSV files (one for release, one for refs), +// runs verification within a group and will emit biblioref. // // $ skate-reduce -m verify -F a.tsv -L b.tsv // // * ref: takes a single file with clusters containing releases and refs and -// will emit verification results. +// will emit verification results. 
// // $ skate-reduce -m ref < a.ndj // @@ -48,13 +49,12 @@ import ( var ( numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") batchSize = flag.Int("b", 10000, "batch size") - // Each mode may work on one or two files, and may need extra args. - mode = flag.String("m", "ref", "mode, e.g. exact, verify, ref, bref, wiki") + mode = flag.String("m", "ref", "mode, e.g. exact, verify, ref, bref, wiki") cpuProfile = flag.String("cpuprofile", "", "write cpu profile to file") memProfile = flag.String("memprofile", "", "write heap profile to file (go tool pprof -png --alloc_objects program mem.pprof > mem.png)") - // Possible inputs, we could switch to a subcommand cli parser. + // Possible inputs -- we could switch to a subcommand cli parser? refs = flag.String("F", "", "path to refs input") releases = flag.String("L", "", "path to release input") wiki = flag.String("W", "", "path to wiki input") diff --git a/skate/map.go b/skate/map.go index 356839c..cf2933e 100644 --- a/skate/map.go +++ b/skate/map.go @@ -181,9 +181,10 @@ func MapperTitleSandcrawler(p []byte) (fields [][]byte, err error) { return fields, nil } -// MapperURLFromRef extracts the work, release ident, url and doc. Previously: -// parallel -j 16 --block 100M --pipe "jq -rc '[.work_ident, .release_ident, -// .biblio.url?] | @tsv'" ... +// MapperURLFromRef extracts the (work ident, release ident, url, doc). +// Previously: parallel -j 16 --block 100M --pipe "jq -rc '[.work_ident, +// .release_ident, .biblio.url?] | @tsv'" ... +// This implementation seems slightly faster than jq and parallel. func MapperURLFromRef(p []byte) (fields [][]byte, err error) { var ref Ref if err = json.Unmarshal(p, &ref); err != nil { diff --git a/skate/zipkey/zipkey.go b/skate/zipkey/zipkey.go index eb3dc55..aa49983 100644 --- a/skate/zipkey/zipkey.go +++ b/skate/zipkey/zipkey.go @@ -1,5 +1,5 @@ // Package zipkey implements ZipRun, a type that allows to attach a callback to -// a group of elements taken from two streams. 
+// a group of elements with a shared key taken from two streams. package zipkey import ( |