about summary refs log tree commit diff stats
path: root/skate
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-07-02 21:44:49 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-07-03 00:02:55 +0200
commit 6a11c515df6b9d7c707d96dac2b52098153e39f4 (patch)
tree a86a35873849f045c1885182e7bd2094c8c15c6b /skate
parent 5504eacd27d6f3ea8d904904728d68efe85e4814 (diff)
download refcat-6a11c515df6b9d7c707d96dac2b52098153e39f4.tar.gz
refcat-6a11c515df6b9d7c707d96dac2b52098153e39f4.zip
reduce allocations during bref encoding
Diffstat (limited to 'skate')
-rw-r--r-- skate/schema.go 23
-rw-r--r-- skate/zippy.go 30
2 files changed, 48 insertions, 5 deletions
diff --git a/skate/schema.go b/skate/schema.go
index 8554e14..d35272e 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -350,6 +350,29 @@ type BiblioRef struct {
TargetCSL string `json:"target_csl,omitempty"`
}
+// Reset sets the fields listed below back to their zero values ("" for the
+// string fields, 0 for RefIndex), so that a BiblioRef can be reused for
+// another document without carrying over data from a previous one. Only the
+// fields named here are cleared; keep the list in sync with the BiblioRef
+// struct definition.
+func (b *BiblioRef) Reset() {
+	// Document level fields.
+	b.Key, b.IndexedTs = "", ""
+	// Source* fields.
+	b.SourceReleaseIdent, b.SourceWorkIdent = "", ""
+	b.SourceWikipediaArticle = ""
+	b.SourceReleaseStage, b.SourceYear = "", ""
+	// Ref* fields.
+	b.RefIndex = 0
+	b.RefKey, b.RefLocator = "", ""
+	// Target* fields.
+	b.TargetReleaseIdent, b.TargetWorkIdent = "", ""
+	b.TargetOpenLibraryWork = ""
+	b.TargetURLSurt, b.TargetURL = "", ""
+	b.TargetUnstructured, b.TargetCSL = "", ""
+	// Match* fields.
+	b.MatchProvenance, b.MatchStatus, b.MatchReason = "", "", ""
+}
+
// LinkHash returns a string that is the same if source and target are equal,
// and different otherwise. This can be used to detect duplicate links.
func (b *BiblioRef) LinkHash() string {
diff --git a/skate/zippy.go b/skate/zippy.go
index b69ce2b..ff836e8 100644
--- a/skate/zippy.go
+++ b/skate/zippy.go
@@ -20,6 +20,7 @@ import (
"log"
"sort"
"strings"
+ "sync"
"time"
"git.archive.org/martin/cgraph/skate/set"
@@ -27,6 +28,13 @@ import (
json "github.com/segmentio/encoding/json"
)
+// brefPool holds reusable BiblioRef values for the Zippy* functions in this
+// file. Entries are stored by value, which is why the call sites type-assert
+// with Get().(BiblioRef); they call Reset right after Get and Put the value
+// back once it has been encoded.
+//
+// NOTE(review): putting a non-pointer value into a sync.Pool boxes it into an
+// interface on every Put, which itself allocates (staticcheck SA6002). Storing
+// *BiblioRef would avoid that, but requires changing every Get/Put call site
+// together with this declaration.
+var brefPool = sync.Pool{
+	New: func() interface{} { return BiblioRef{} },
+}
+
// groupLogf logs a message alongside a serialized group for debugging.
func groupLogf(g *zipkey.Group, s string, vs ...interface{}) {
log.Printf(s, vs...)
@@ -35,12 +43,13 @@ func groupLogf(g *zipkey.Group, s string, vs ...interface{}) {
}
// ZippyExact takes a release and refs reader (key, doc) and assigns a fixed
-// match result.
+// match result, e.g. for DOI matches.
func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer) error {
var (
enc = json.NewEncoder(w)
keyer = makeKeyFunc("\t", 1)
i = 0
+ bref BiblioRef
grouper = func(g *zipkey.Group) error {
i++
if i%10000 == 0 {
@@ -63,7 +72,8 @@ func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer)
groupLogf(g, "[skip] failed to parse ref: %v", err)
continue
}
- var bref BiblioRef
+ bref = brefPool.Get().(BiblioRef)
+ bref.Reset()
bref.SourceReleaseIdent = ref.ReleaseIdent
bref.SourceWorkIdent = ref.WorkIdent
bref.SourceReleaseStage = ref.ReleaseStage
@@ -78,6 +88,7 @@ func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer)
if err := enc.Encode(bref); err != nil {
return err
}
+ brefPool.Put(bref)
}
return nil
}
@@ -93,6 +104,7 @@ func ZippyExactReleases(olr, releases io.Reader, matchResult MatchResult, w io.W
enc = json.NewEncoder(w)
keyer = makeKeyFunc("\t", 1)
i = 0
+ bref BiblioRef
grouper = func(g *zipkey.Group) error {
i++
if i%10000 == 0 {
@@ -117,7 +129,8 @@ func ZippyExactReleases(olr, releases io.Reader, matchResult MatchResult, w io.W
if target.WorkID == "" {
continue
}
- var bref BiblioRef
+ bref = brefPool.Get().(BiblioRef)
+ bref.Reset()
bref.SourceReleaseIdent = re.Ident
bref.SourceWorkIdent = re.WorkID
bref.SourceReleaseStage = re.ReleaseStage
@@ -131,6 +144,7 @@ func ZippyExactReleases(olr, releases io.Reader, matchResult MatchResult, w io.W
if err := enc.Encode(bref); err != nil {
return err
}
+ brefPool.Put(bref)
}
return nil
}
@@ -145,6 +159,7 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
var (
enc = json.NewEncoder(w)
keyer = makeKeyFunc("\t", 1)
+ bref BiblioRef
grouper = func(g *zipkey.Group) error {
var (
target *Release
@@ -161,7 +176,8 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
if wiki, err = parseWiki(Cut(line, 2)); err != nil {
return err
}
- var bref BiblioRef
+ bref = brefPool.Get().(BiblioRef)
+ bref.Reset()
bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use?
bref.SourceWikipediaArticle = wiki.PageTitle
bref.TargetReleaseIdent = target.Ident
@@ -172,6 +188,7 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
if err := enc.Encode(bref); err != nil {
return err
}
+ brefPool.Put(bref)
}
return nil
}
@@ -270,6 +287,7 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
var (
enc = json.NewEncoder(w)
keyer = makeKeyFunc("\t", 1)
+ bref BiblioRef
grouper = func(g *zipkey.Group) error {
var (
ref, pivot *Release // ref (reference), pivot (open library)
@@ -292,7 +310,8 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
result := Verify(pivot, ref)
switch result.Status {
case StatusExact, StatusStrong:
- var bref BiblioRef
+ bref = brefPool.Get().(BiblioRef)
+ bref.Reset()
bref.SourceReleaseIdent = ref.Ident
bref.SourceWorkIdent = ref.WorkID
bref.SourceReleaseStage = ref.ReleaseStage
@@ -306,6 +325,7 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
if err := enc.Encode(bref); err != nil {
return err
}
+ brefPool.Put(bref)
default:
}
}