aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2021-07-06 00:16:44 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-07-06 00:16:44 +0200
commit: b5eee42e8918ab8e07684b7d15c07c443d995912 (patch)
tree: 084ca98166503764a5c861450c064de70be183ee
parent: 2f6102a0c0fba658ef664f44af5e65b007930033 (diff)
download: refcat-b5eee42e8918ab8e07684b7d15c07c443d995912.tar.gz
refcat-b5eee42e8918ab8e07684b7d15c07c443d995912.zip
wip: improve reduce performance
-rw-r--r-- skate/reduce.go 58
1 file changed, 8 insertions, 50 deletions
diff --git a/skate/reduce.go b/skate/reduce.go
index c941a67..c5da99e 100644
--- a/skate/reduce.go
+++ b/skate/reduce.go
@@ -23,38 +23,14 @@ import (
"log"
"sort"
"strings"
- "sync"
"time"
- stdjson "encoding/json"
-
"git.archive.org/martin/cgraph/skate/set"
"git.archive.org/martin/cgraph/skate/xio"
"git.archive.org/martin/cgraph/skate/zipkey"
json "github.com/segmentio/encoding/json"
)
-var brefPool = sync.Pool{
- New: func() interface{} {
- var bref BiblioRef
- return bref
- },
-}
-
-var releasePool = sync.Pool{
- New: func() interface{} {
- var r Release
- return &r
- },
-}
-
-var refPool = sync.Pool{
- New: func() interface{} {
- var r Ref
- return &r
- },
-}
-
// groupLogf logs a message alongside a serialized group for debugging.
func groupLogf(g *zipkey.Group, s string, vs ...interface{}) {
log.Printf(s, vs...)
@@ -66,22 +42,15 @@ func groupLogf(g *zipkey.Group, s string, vs ...interface{}) {
// match result, e.g. for doi matches.
func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer) error {
var (
- enc = xio.NewSafeEncoder(stdjson.NewEncoder(w))
+ enc = json.NewEncoder(xio.NewSingleWriter(w))
keyer = makeKeyFunc("\t", 1)
- i = 0
- bref BiblioRef
batcher = zipkey.NewBatcher(func(g *zipkey.Group) error {
- i++
- if i%10000 == 0 {
- log.Printf("processed %v groups", i)
- }
var (
- target = releasePool.Get().(*Release)
- ref = refPool.Get().(*Ref)
+ target *Release
+ ref *Ref
+ bref BiblioRef
err error
)
- defer releasePool.Put(target)
- defer refPool.Put(ref)
if len(g.G0) == 0 || len(g.G1) == 0 {
return nil
}
@@ -94,8 +63,6 @@ func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer)
groupLogf(g, "[skip] failed to parse ref: %v", err)
continue
}
- bref = brefPool.Get().(BiblioRef)
- defer brefPool.Put(bref)
bref.Reset()
bref.SourceReleaseIdent = ref.ReleaseIdent
bref.SourceWorkIdent = ref.WorkIdent
@@ -115,7 +82,7 @@ func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer)
return nil
})
)
- batcher.Size = 50000 // hard-code for now
+ batcher.Size = 10000 // hard-code for now
defer batcher.Close()
zipper := zipkey.New(releases, refs, keyer, batcher.GroupFunc)
return zipper.Run()
@@ -128,7 +95,6 @@ func ZippyExactReleases(olr, releases io.Reader, matchResult MatchResult, w io.W
enc = json.NewEncoder(w)
keyer = makeKeyFunc("\t", 1)
i = 0
- bref BiblioRef
grouper = func(g *zipkey.Group) error {
i++
if i%10000 == 0 {
@@ -153,8 +119,7 @@ func ZippyExactReleases(olr, releases io.Reader, matchResult MatchResult, w io.W
if target.WorkID == "" {
continue
}
- bref = brefPool.Get().(BiblioRef)
- bref.Reset()
+ var bref BiblioRef
bref.SourceReleaseIdent = re.Ident
bref.SourceWorkIdent = re.WorkID
bref.SourceReleaseStage = re.ReleaseStage
@@ -168,7 +133,6 @@ func ZippyExactReleases(olr, releases io.Reader, matchResult MatchResult, w io.W
if err := enc.Encode(bref); err != nil {
return err
}
- brefPool.Put(bref)
}
return nil
}
@@ -183,7 +147,6 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
var (
enc = json.NewEncoder(w)
keyer = makeKeyFunc("\t", 1)
- bref BiblioRef
grouper = func(g *zipkey.Group) error {
var (
target *Release
@@ -200,8 +163,7 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
if wiki, err = parseWiki(Cut(line, 2)); err != nil {
return err
}
- bref = brefPool.Get().(BiblioRef)
- bref.Reset()
+ var bref BiblioRef
bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use?
bref.SourceWikipediaArticle = wiki.PageTitle
bref.TargetReleaseIdent = target.Ident
@@ -212,7 +174,6 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
if err := enc.Encode(bref); err != nil {
return err
}
- brefPool.Put(bref)
}
return nil
}
@@ -311,7 +272,6 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
var (
enc = json.NewEncoder(w)
keyer = makeKeyFunc("\t", 1)
- bref BiblioRef
grouper = func(g *zipkey.Group) error {
var (
ref, pivot *Release // ref (reference), pivot (open library)
@@ -334,8 +294,7 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
result := Verify(pivot, ref)
switch result.Status {
case StatusExact, StatusStrong:
- bref = brefPool.Get().(BiblioRef)
- bref.Reset()
+ var bref BiblioRef
bref.SourceReleaseIdent = ref.Ident
bref.SourceWorkIdent = ref.WorkID
bref.SourceReleaseStage = ref.ReleaseStage
@@ -349,7 +308,6 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
if err := enc.Encode(bref); err != nil {
return err
}
- brefPool.Put(bref)
default:
}
}