From 6a11c515df6b9d7c707d96dac2b52098153e39f4 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Fri, 2 Jul 2021 21:44:49 +0200
Subject: reduce allocations during bref encoding

---
Review note: the pool now holds *BiblioRef instead of BiblioRef values.
Putting a struct value into a sync.Pool boxes it into an interface and
allocates on every Put (staticcheck SA6002), which defeated the purpose of
this change. Hunk offsets and the diffstat are recomputed; the post-image
blob ids on the "index" lines are carried over from the original commit.

 skate/schema.go |  6 ++++++
 skate/zippy.go  | 32 +++++++++++++++++++++++++++-----
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/skate/schema.go b/skate/schema.go
index 8554e14..d35272e 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -350,6 +350,12 @@ type BiblioRef struct {
 	TargetCSL              string `json:"target_csl,omitempty"`
 }
 
+// Reset zeroes every field, so a pooled value carries no data over from its
+// previous use.
+func (b *BiblioRef) Reset() {
+	*b = BiblioRef{}
+}
+
 // Hash returns a string that will be the same, if source and target are
 // equal; different otherwise. This can be used to detect duplicate links.
 func (b *BiblioRef) LinkHash() string {
diff --git a/skate/zippy.go b/skate/zippy.go
index b69ce2b..ff836e8 100644
--- a/skate/zippy.go
+++ b/skate/zippy.go
@@ -20,6 +20,7 @@ import (
 	"log"
 	"sort"
 	"strings"
+	"sync"
 	"time"
 
 	"git.archive.org/martin/cgraph/skate/set"
@@ -27,6 +28,15 @@ import (
 	json "github.com/segmentio/encoding/json"
 )
 
+// brefPool recycles BiblioRef values across encode calls. It holds pointers,
+// because putting a plain struct into the pool boxes it into an interface,
+// which allocates on every Put and defeats the purpose of pooling.
+var brefPool = sync.Pool{
+	New: func() interface{} {
+		return new(BiblioRef)
+	},
+}
+
 // groupLogf logs a message alongsize a serialized group for debugging.
 func groupLogf(g *zipkey.Group, s string, vs ...interface{}) {
 	log.Printf(s, vs...)
@@ -35,12 +45,13 @@ func groupLogf(g *zipkey.Group, s string, vs ...interface{}) {
 }
 
 // ZippyExact takes a release and refs reader (key, doc) and assigns a fixed
-// match result.
+// match result, e.g. for DOI matches.
 func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer) error {
 	var (
 		enc     = json.NewEncoder(w)
 		keyer   = makeKeyFunc("\t", 1)
 		i       = 0
+		bref    *BiblioRef
 		grouper = func(g *zipkey.Group) error {
 			i++
 			if i%10000 == 0 {
@@ -63,7 +74,8 @@ func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer)
 					groupLogf(g, "[skip] failed to parse ref: %v", err)
 					continue
 				}
-				var bref BiblioRef
+				bref = brefPool.Get().(*BiblioRef)
+				bref.Reset()
 				bref.SourceReleaseIdent = ref.ReleaseIdent
 				bref.SourceWorkIdent = ref.WorkIdent
 				bref.SourceReleaseStage = ref.ReleaseStage
@@ -78,6 +90,7 @@ func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer)
 				if err := enc.Encode(bref); err != nil {
 					return err
 				}
+				brefPool.Put(bref)
 			}
 			return nil
 		}
@@ -93,6 +106,7 @@ func ZippyExactReleases(olr, releases io.Reader, matchResult MatchResult, w io.W
 		enc     = json.NewEncoder(w)
 		keyer   = makeKeyFunc("\t", 1)
 		i       = 0
+		bref    *BiblioRef
 		grouper = func(g *zipkey.Group) error {
 			i++
 			if i%10000 == 0 {
@@ -117,7 +131,8 @@ func ZippyExactReleases(olr, releases io.Reader, matchResult MatchResult, w io.W
 				if target.WorkID == "" {
 					continue
 				}
-				var bref BiblioRef
+				bref = brefPool.Get().(*BiblioRef)
+				bref.Reset()
 				bref.SourceReleaseIdent = re.Ident
 				bref.SourceWorkIdent = re.WorkID
 				bref.SourceReleaseStage = re.ReleaseStage
@@ -131,6 +146,7 @@ func ZippyExactReleases(olr, releases io.Reader, matchResult MatchResult, w io.W
 				if err := enc.Encode(bref); err != nil {
 					return err
 				}
+				brefPool.Put(bref)
 			}
 			return nil
 		}
@@ -145,6 +161,7 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
 	var (
 		enc     = json.NewEncoder(w)
 		keyer   = makeKeyFunc("\t", 1)
+		bref    *BiblioRef
 		grouper = func(g *zipkey.Group) error {
 			var (
 				target *Release
@@ -161,7 +178,8 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
 				if wiki, err = parseWiki(Cut(line, 2)); err != nil {
 					return err
 				}
-				var bref BiblioRef
+				bref = brefPool.Get().(*BiblioRef)
+				bref.Reset()
 				bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use?
 				bref.SourceWikipediaArticle = wiki.PageTitle
 				bref.TargetReleaseIdent = target.Ident
@@ -172,6 +190,7 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
 				if err := enc.Encode(bref); err != nil {
 					return err
 				}
+				brefPool.Put(bref)
 			}
 			return nil
 		}
@@ -270,6 +289,7 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
 	var (
 		enc     = json.NewEncoder(w)
 		keyer   = makeKeyFunc("\t", 1)
+		bref    *BiblioRef
 		grouper = func(g *zipkey.Group) error {
 			var (
 				ref, pivot *Release // ref (reference), pivot (open library)
@@ -292,7 +312,8 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
 				result := Verify(pivot, ref)
 				switch result.Status {
 				case StatusExact, StatusStrong:
-					var bref BiblioRef
+					bref = brefPool.Get().(*BiblioRef)
+					bref.Reset()
 					bref.SourceReleaseIdent = ref.Ident
 					bref.SourceWorkIdent = ref.WorkID
 					bref.SourceReleaseStage = ref.ReleaseStage
@@ -306,6 +327,7 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
 					if err := enc.Encode(bref); err != nil {
 						return err
 					}
+					brefPool.Put(bref)
 				default:
 				}
 			}
-- 
cgit v1.2.3