diff options
-rw-r--r-- | skate/reduce.go | 15 |
-rw-r--r-- | skate/zipkey/batch.go | 7 |
2 files changed, 11 insertions, 11 deletions
diff --git a/skate/reduce.go b/skate/reduce.go index d093f5a..df96076 100644 --- a/skate/reduce.go +++ b/skate/reduce.go @@ -1,5 +1,5 @@ -// This file contains various "reducers", e.g. merging data from two streams and -// applying a function on groups of documents with a shared key. +// This file contains various "reducers", which e.g. merge data from two +// streams and apply a function on groups of documents with a shared key. // // Note: This is a bit repetitive, but we do not want to introduce any other // abstraction for now. Since most of the logic is in the "grouper" functions, @@ -7,10 +7,10 @@ // the fly. // // The most confusing aspect currently is the variety of schemas hidden within -// the readers (and string groups): release, ref, ref-as-release, open library, -// wikipedia, ... +// the readers (and string groups): release, ref, biblioref, csl, +// ref-as-release, open library, wikipedia, ... // -// We call the biblioref schema sometimes just bref. +// We call the biblioref schema sometimes just bref, for short. // // TODO: // * [ ] pass release stage through all match types @@ -89,7 +89,7 @@ func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer) } return nil } - batcher = zipkey.NewBatcher(grouper) // hard-code for now; on 24 cores 10K take up over 8G of RAM + batcher = zipkey.NewBatcher(grouper) ) defer batcher.Close() zipper := zipkey.New(releases, refs, keyer, batcher.GroupFunc) @@ -180,7 +180,7 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error // We use lowercase base32 w/o padding of the original // PageTitle as component for the id. XXX: ok for now? if wiki.Language == "" { - lang = "en" + lang = "en" // XXX: We currently only use "en" subset. } else { lang = wiki.Language } @@ -191,7 +191,6 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error } seen.Add(key) bref.Key = key - // XXX: We currently only use "en" subset. 
bref.SourceWikipediaArticle = fmt.Sprintf("%s:%s", lang, wiki.PageTitle) bref.TargetReleaseIdent = target.Ident bref.TargetWorkIdent = target.WorkID diff --git a/skate/zipkey/batch.go b/skate/zipkey/batch.go index 6ab7eee..56f6f0d 100644 --- a/skate/zipkey/batch.go +++ b/skate/zipkey/batch.go @@ -22,7 +22,7 @@ type Batcher struct { closing bool // https://stackoverflow.com/q/16105325/89391 } -// NewBatcher set ups a new Batcher with a batch size of 1000. +// NewBatcher set ups a new Batcher with a default batch size. func NewBatcher(gf groupFunc) *Batcher { return NewBatcherSize(gf, defaultBatchSize) } @@ -43,7 +43,8 @@ func NewBatcherSize(gf groupFunc, size int) *Batcher { } // Close tears down the batcher. If this is not called, you get goroutine leaks -// and will miss the data from the last uncommitted batch. +// and will miss the data from the last uncommitted batch. Calling this +// function more than once will result in a panic. func (b *Batcher) Close() error { b.closing = true g := make([]*Group, len(b.batch)) @@ -56,7 +57,7 @@ func (b *Batcher) Close() error { } // GroupFunc is a drop-in for a groupFunc. Use this function, where you used -// grouper before. Not thread safe. Panics if called after Close. +// groupFunc before. Not thread safe. Panics if called after Close. func (b *Batcher) GroupFunc(g *Group) error { if b.closing { panic("cannot call GroupFunc after Close") |