diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-07-06 00:16:44 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-07-06 00:16:44 +0200 |
commit | b5eee42e8918ab8e07684b7d15c07c443d995912 (patch) | |
tree | 084ca98166503764a5c861450c064de70be183ee | |
parent | 2f6102a0c0fba658ef664f44af5e65b007930033 (diff) | |
download | refcat-b5eee42e8918ab8e07684b7d15c07c443d995912.tar.gz refcat-b5eee42e8918ab8e07684b7d15c07c443d995912.zip |
wip: improve reduce performance
-rw-r--r-- | skate/reduce.go | 58 |
1 files changed, 8 insertions, 50 deletions
diff --git a/skate/reduce.go b/skate/reduce.go index c941a67..c5da99e 100644 --- a/skate/reduce.go +++ b/skate/reduce.go @@ -23,38 +23,14 @@ import ( "log" "sort" "strings" - "sync" "time" - stdjson "encoding/json" - "git.archive.org/martin/cgraph/skate/set" "git.archive.org/martin/cgraph/skate/xio" "git.archive.org/martin/cgraph/skate/zipkey" json "github.com/segmentio/encoding/json" ) -var brefPool = sync.Pool{ - New: func() interface{} { - var bref BiblioRef - return bref - }, -} - -var releasePool = sync.Pool{ - New: func() interface{} { - var r Release - return &r - }, -} - -var refPool = sync.Pool{ - New: func() interface{} { - var r Ref - return &r - }, -} - // groupLogf logs a message alongsize a serialized group for debugging. func groupLogf(g *zipkey.Group, s string, vs ...interface{}) { log.Printf(s, vs...) @@ -66,22 +42,15 @@ func groupLogf(g *zipkey.Group, s string, vs ...interface{}) { // match result, e.g. for doi matches. func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer) error { var ( - enc = xio.NewSafeEncoder(stdjson.NewEncoder(w)) + enc = json.NewEncoder(xio.NewSingleWriter(w)) keyer = makeKeyFunc("\t", 1) - i = 0 - bref BiblioRef batcher = zipkey.NewBatcher(func(g *zipkey.Group) error { - i++ - if i%10000 == 0 { - log.Printf("processed %v groups", i) - } var ( - target = releasePool.Get().(*Release) - ref = refPool.Get().(*Ref) + target *Release + ref *Ref + bref BiblioRef err error ) - defer releasePool.Put(target) - defer refPool.Put(ref) if len(g.G0) == 0 || len(g.G1) == 0 { return nil } @@ -94,8 +63,6 @@ func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer) groupLogf(g, "[skip] failed to parse ref: %v", err) continue } - bref = brefPool.Get().(BiblioRef) - defer brefPool.Put(bref) bref.Reset() bref.SourceReleaseIdent = ref.ReleaseIdent bref.SourceWorkIdent = ref.WorkIdent @@ -115,7 +82,7 @@ func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer) return nil }) ) - batcher.Size = 50000 // hard-code for now + batcher.Size = 10000 // hard-code for now defer batcher.Close() zipper := zipkey.New(releases, refs, keyer, batcher.GroupFunc) return zipper.Run() @@ -128,7 +95,6 @@ func ZippyExactReleases(olr, releases io.Reader, matchResult MatchResult, w io.W enc = json.NewEncoder(w) keyer = makeKeyFunc("\t", 1) i = 0 - bref BiblioRef grouper = func(g *zipkey.Group) error { i++ if i%10000 == 0 { @@ -153,8 +119,7 @@ func ZippyExactReleases(olr, releases io.Reader, matchResult MatchResult, w io.W if target.WorkID == "" { continue } - bref = brefPool.Get().(BiblioRef) - bref.Reset() + var bref BiblioRef bref.SourceReleaseIdent = re.Ident bref.SourceWorkIdent = re.WorkID bref.SourceReleaseStage = re.ReleaseStage @@ -168,7 +133,6 @@ func ZippyExactReleases(olr, releases io.Reader, matchResult MatchResult, w io.W if err := enc.Encode(bref); err != nil { return err } - brefPool.Put(bref) } return nil } @@ -183,7 +147,6 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error var ( enc = json.NewEncoder(w) keyer = makeKeyFunc("\t", 1) - bref BiblioRef grouper = func(g *zipkey.Group) error { var ( target *Release @@ -200,8 +163,7 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error if wiki, err = parseWiki(Cut(line, 2)); err != nil { return err } - bref = brefPool.Get().(BiblioRef) - bref.Reset() + var bref BiblioRef bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use? bref.SourceWikipediaArticle = wiki.PageTitle bref.TargetReleaseIdent = target.Ident @@ -212,7 +174,6 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error if err := enc.Encode(bref); err != nil { return err } - brefPool.Put(bref) } return nil } @@ -311,7 +272,6 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { var ( enc = json.NewEncoder(w) keyer = makeKeyFunc("\t", 1) - bref BiblioRef grouper = func(g *zipkey.Group) error { var ( ref, pivot *Release // ref (reference), pivot (open library) @@ -334,8 +294,7 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { result := Verify(pivot, ref) switch result.Status { case StatusExact, StatusStrong: - bref = brefPool.Get().(BiblioRef) - bref.Reset() + var bref BiblioRef bref.SourceReleaseIdent = ref.Ident bref.SourceWorkIdent = ref.WorkID bref.SourceReleaseStage = ref.ReleaseStage @@ -349,7 +308,6 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { if err := enc.Encode(bref); err != nil { return err } - brefPool.Put(bref) default: } } |