diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-09 11:47:02 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-09 11:47:02 +0200 |
commit | d2931da01a5c7d7254b5c7f4e4f8c1fa20513235 (patch) | |
tree | 7ea932a3c075b79fd11b787846057b5256c36c20 /skate | |
parent | e8fde7995d5f3c57baa8594fc5414a0fd303c025 (diff) | |
parent | 9e1c8ced8063d56ddd10683903e388374f0e8362 (diff) | |
download | refcat-d2931da01a5c7d7254b5c7f4e4f8c1fa20513235.tar.gz refcat-d2931da01a5c7d7254b5c7f4e4f8c1fa20513235.zip |
Merge branch 'master' of git.archive.org:martin/cgraph
* 'master' of git.archive.org:martin/cgraph:
update task UnmatchedRefs
stub: abbrev
update docs
remove comments [vim]
update docs
Diffstat (limited to 'skate')
-rw-r--r-- | skate/cmd/skate-reduce/main.go | 26 | ||||
-rw-r--r-- | skate/map.go | 7 | ||||
-rw-r--r-- | skate/zipkey/zipkey.go | 2 |
3 files changed, 18 insertions, 17 deletions
diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go index df72ef4..d0cc0e3 100644 --- a/skate/cmd/skate-reduce/main.go +++ b/skate/cmd/skate-reduce/main.go @@ -1,24 +1,25 @@ // skate-reduce takes prepared inputs (e.g. from skate-map or skate-cluster) // and applies various verification and conversion functions. The output will -// often be the biblioref schema. +// often be a biblioref schema stream. // -// Support various modes. +// Support various modes, e.g. exact, verify, ref, bref, wiki. Each mode may +// work on one or two files, and may need extra args. // -// * exact: takes (key, doc) TSV files (one for releases, one for refs) and -// will emit biblioref docs relating one element from releases with all -// elements from ref; this is for "doi", "pmid" and other id matches, where no -// further checks are necessary. The match reason, e.g. "doi" needs to be -// supplied. +// * exact: takes two (key, doc) TSV files (one for releases, one for refs) and +// will emit biblioref docs relating *one* element from releases with *all* +// elements from ref; this is for "doi", "pmid" and other id matches, where no +// further checks are necessary. The match reason, e.g. "doi" needs to be +// supplied. // // $ skate-reduce -m exact -r doi -F a.tsv -L b.tsv // -// * verify: takes (key, doc) TSV files (one for release, one for refs), runs -// verification within a group and will emit biblioref. +// * verify: takes two (key, doc) TSV files (one for release, one for refs), +// runs verification within a group and will emit biblioref. // // $ skate-reduce -m verify -F a.tsv -L b.tsv // // * ref: takes a single file with clusters containing releases and refs and -// will emit verification results. +// will emit verification results. 
// // $ skate-reduce -m ref < a.ndj // @@ -48,13 +49,12 @@ import ( var ( numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") batchSize = flag.Int("b", 10000, "batch size") - // Each mode may work on one or two files, and may need extra args. - mode = flag.String("m", "ref", "mode, e.g. exact, verify, ref, bref, wiki") + mode = flag.String("m", "ref", "mode, e.g. exact, verify, ref, bref, wiki") cpuProfile = flag.String("cpuprofile", "", "write cpu profile to file") memProfile = flag.String("memprofile", "", "write heap profile to file (go tool pprof -png --alloc_objects program mem.pprof > mem.png)") - // Possible inputs, we could switch to a subcommand cli parser. + // Possible inputs -- we could switch to a subcommand cli parser? refs = flag.String("F", "", "path to refs input") releases = flag.String("L", "", "path to release input") wiki = flag.String("W", "", "path to wiki input") diff --git a/skate/map.go b/skate/map.go index 356839c..cf2933e 100644 --- a/skate/map.go +++ b/skate/map.go @@ -181,9 +181,10 @@ func MapperTitleSandcrawler(p []byte) (fields [][]byte, err error) { return fields, nil } -// MapperURLFromRef extracts the work, release ident, url and doc. Previously: -// parallel -j 16 --block 100M --pipe "jq -rc '[.work_ident, .release_ident, -// .biblio.url?] | @tsv'" ... +// MapperURLFromRef extracts the (work ident, release ident, url, doc). +// Previously: parallel -j 16 --block 100M --pipe "jq -rc '[.work_ident, +// .release_ident, .biblio.url?] | @tsv'" ... +// This implementation seems slightly faster than jq and parallel. func MapperURLFromRef(p []byte) (fields [][]byte, err error) { var ref Ref if err = json.Unmarshal(p, &ref); err != nil { diff --git a/skate/zipkey/zipkey.go b/skate/zipkey/zipkey.go index eb3dc55..aa49983 100644 --- a/skate/zipkey/zipkey.go +++ b/skate/zipkey/zipkey.go @@ -1,5 +1,5 @@ // Package zipkey implements ZipRun, a type that allows to attach a callback to -// a group of elements taken from two streams. 
+// a group of elements with a shared key taken from two streams. package zipkey import ( |