aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--skate/zippy.go21
-rw-r--r--skate/zippy_test.go4
2 files changed, 18 insertions, 7 deletions
diff --git a/skate/zippy.go b/skate/zippy.go
index 1e660da..06793b4 100644
--- a/skate/zippy.go
+++ b/skate/zippy.go
@@ -320,6 +320,7 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
// ref index and key.
func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error {
var (
+ stats = statsAugment{}
enc = json.NewEncoder(w)
keyer = makeKeyFunc("\t", 1)
grouper = func(g *zipkey.Group) error {
@@ -328,7 +329,7 @@ func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error {
// First, iterate over all matches and sort out duplicates, e.g.
// docs that have the same source and target id.
- matched, err := uniqueMatches(CutBatch(g.G0, 2))
+ matched, err := uniqueMatches(CutBatch(g.G0, 2), &stats)
if err != nil {
return err
}
@@ -343,7 +344,7 @@ func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error {
}
refs[i] = &ref
}
- matched = matchedRefsExtend(matched, refs)
+ matched = matchedRefsExtend(matched, refs, &stats)
for _, bref := range matched {
if err := enc.Encode(bref); err != nil {
return err
@@ -363,13 +364,16 @@ func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error {
// set of biblioref docs (unchanged) plus raw references as biblioref, which
// did not result in a match (determined by e.g. ref key and index). XXX: We
// may have duplicate refs as well - how to distinguish them?
-func matchedRefsExtend(matched []*BiblioRef, refs []*Ref) []*BiblioRef {
+func matchedRefsExtend(matched []*BiblioRef, refs []*Ref, stats *statsAugment) []*BiblioRef {
s := set.New() // store key + index of matched items
for _, m := range matched {
s.Add(m.RefKey + fmt.Sprintf("%d", m.RefIndex))
}
for _, r := range refs {
if s.Contains(r.Key + fmt.Sprintf("%d", r.Index)) {
+ stats.skipDuplicatedRef++
+ log.Printf("skip-dup-ref [%d]: %v, %v",
+ stats.skipDuplicatedRef, r.Key, fmt.Sprintf("%d", r.Index))
continue
}
var bref BiblioRef
@@ -392,7 +396,7 @@ func matchedRefsExtend(matched []*BiblioRef, refs []*Ref) []*BiblioRef {
// uniqueMatches takes a list of bref docs (unserialized) and will return a
// list of deserialized bref docs, containing unique matches only (e.g. filter
// out things duplicate matches, e.g. from exact and fuzzy).
-func uniqueMatches(docs []string) (result []*BiblioRef, err error) {
+func uniqueMatches(docs []string, stats *statsAugment) (result []*BiblioRef, err error) {
var (
brefs []*BiblioRef
bref BiblioRef
@@ -415,7 +419,9 @@ func uniqueMatches(docs []string) (result []*BiblioRef, err error) {
for _, doc := range brefs {
v := hash(doc)
if seen.Contains(v) {
- log.Printf("skip: %v %v %v", doc.SourceReleaseIdent, doc.MatchStatus, doc.MatchReason)
+ stats.skipDuplicatedBref++
+ log.Printf("skip [%d]: %v %v %v",
+ stats.skipDuplicatedBref, doc.SourceReleaseIdent, doc.MatchStatus, doc.MatchReason)
continue
}
seen.Add(v)
@@ -424,6 +430,11 @@ func uniqueMatches(docs []string) (result []*BiblioRef, err error) {
return result, nil
}
+type statsAugment struct {
+ skipDuplicatedBref int64
+ skipDuplicatedRef int64
+}
+
// CutBatch runs Cut over a list of lines.
func CutBatch(lines []string, column int) (result []string) {
for _, line := range lines {
diff --git a/skate/zippy_test.go b/skate/zippy_test.go
index d0529c2..16aa74d 100644
--- a/skate/zippy_test.go
+++ b/skate/zippy_test.go
@@ -123,7 +123,7 @@ func TestUniqueMatches(t *testing.T) {
},
}
for _, c := range cases {
- result, err := uniqueMatches(c.docs)
+ result, err := uniqueMatches(c.docs, &statsAugment{})
if err != c.err {
t.Fatalf("got %v, want %v (%s)", err, c.err, c.about)
}
@@ -224,7 +224,7 @@ func TestMatchedRefsExtend(t *testing.T) {
},
}
for _, c := range cases {
- result := matchedRefsExtend(c.matched, c.refs)
+ result := matchedRefsExtend(c.matched, c.refs, &statsAugment{})
if !reflect.DeepEqual(result, c.result) {
t.Fatalf("got %v, want %v (%v)", result, c.result, pretty.Diff(result, c.result))
}