aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-07-14 22:59:28 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-07-14 22:59:28 +0200
commit 902ae78beda424f203db8b8f59cd5e54516475af (patch)
tree b16eb99cb7b05fda875598d47dde03fcd3f8e59a /skate
parent e99f54c2e653f0662ba0823fa3ac4bccc475dce1 (diff)
downloadrefcat-902ae78beda424f203db8b8f59cd5e54516475af.tar.gz
refcat-902ae78beda424f203db8b8f59cd5e54516475af.zip
update docs
Diffstat (limited to 'skate')
-rw-r--r-- skate/reduce.go 15
-rw-r--r-- skate/zipkey/batch.go 7
2 files changed, 11 insertions, 11 deletions
diff --git a/skate/reduce.go b/skate/reduce.go
index d093f5a..df96076 100644
--- a/skate/reduce.go
+++ b/skate/reduce.go
@@ -1,5 +1,5 @@
-// This file contains various "reducers", e.g. merging data from two streams and
-// applying a function on groups of documents with a shared key.
+// This file contains various "reducers", which e.g. merge data from two
+// streams and apply a function on groups of documents with a shared key.
//
// Note: This is a bit repetitive, but we do not want to introduce any other
// abstraction for now. Since most of the logic is in the "grouper" functions,
@@ -7,10 +7,10 @@
// the fly.
//
// The most confusing aspect currently is the variety of schemas hidden within
-// the readers (and string groups): release, ref, ref-as-release, open library,
-// wikipedia, ...
+// the readers (and string groups): release, ref, biblioref, csl,
+// ref-as-release, open library, wikipedia, ...
//
-// We call the biblioref schema sometimes just bref.
+// We sometimes call the biblioref schema just bref, for short.
//
// TODO:
// * [ ] pass release stage through all match types
@@ -89,7 +89,7 @@ func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer)
}
return nil
}
- batcher = zipkey.NewBatcher(grouper) // hard-code for now; on 24 cores 10K take up over 8G of RAM
+ batcher = zipkey.NewBatcher(grouper)
)
defer batcher.Close()
zipper := zipkey.New(releases, refs, keyer, batcher.GroupFunc)
@@ -180,7 +180,7 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
// We use lowercase base32 w/o padding of the original
// PageTitle as component for the id. XXX: ok for now?
if wiki.Language == "" {
- lang = "en"
+ lang = "en" // XXX: We currently only use "en" subset.
} else {
lang = wiki.Language
}
@@ -191,7 +191,6 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
}
seen.Add(key)
bref.Key = key
- // XXX: We currently only use "en" subset.
bref.SourceWikipediaArticle = fmt.Sprintf("%s:%s", lang, wiki.PageTitle)
bref.TargetReleaseIdent = target.Ident
bref.TargetWorkIdent = target.WorkID
diff --git a/skate/zipkey/batch.go b/skate/zipkey/batch.go
index 6ab7eee..56f6f0d 100644
--- a/skate/zipkey/batch.go
+++ b/skate/zipkey/batch.go
@@ -22,7 +22,7 @@ type Batcher struct {
closing bool // https://stackoverflow.com/q/16105325/89391
}
-// NewBatcher set ups a new Batcher with a batch size of 1000.
+// NewBatcher sets up a new Batcher with the default batch size.
func NewBatcher(gf groupFunc) *Batcher {
return NewBatcherSize(gf, defaultBatchSize)
}
@@ -43,7 +43,8 @@ func NewBatcherSize(gf groupFunc, size int) *Batcher {
}
// Close tears down the batcher. If this is not called, you get goroutine leaks
-// and will miss the data from the last uncommitted batch.
+// and will miss the data from the last uncommitted batch. Calling this
+// function more than once will result in a panic.
func (b *Batcher) Close() error {
b.closing = true
g := make([]*Group, len(b.batch))
@@ -56,7 +57,7 @@ func (b *Batcher) Close() error {
}
// GroupFunc is a drop-in for a groupFunc. Use this function, where you used
-// grouper before. Not thread safe. Panics if called after Close.
+// groupFunc before. Not thread safe. Panics if called after Close.
func (b *Batcher) GroupFunc(g *Group) error {
if b.closing {
panic("cannot call GroupFunc after Close")