diff options
-rw-r--r-- | skate/zippy.go | 21 | ||||
-rw-r--r-- | skate/zippy_test.go | 4 |
2 files changed, 18 insertions, 7 deletions
diff --git a/skate/zippy.go b/skate/zippy.go index 1e660da..06793b4 100644 --- a/skate/zippy.go +++ b/skate/zippy.go @@ -320,6 +320,7 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { // ref index and key. func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error { var ( + stats = statsAugment{} enc = json.NewEncoder(w) keyer = makeKeyFunc("\t", 1) grouper = func(g *zipkey.Group) error { @@ -328,7 +329,7 @@ func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error { // First, iterate over all matches and sort out duplicates, e.g. // docs that have the same source and target id. - matched, err := uniqueMatches(CutBatch(g.G0, 2)) + matched, err := uniqueMatches(CutBatch(g.G0, 2), &stats) if err != nil { return err } @@ -343,7 +344,7 @@ func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error { } refs[i] = &ref } - matched = matchedRefsExtend(matched, refs) + matched = matchedRefsExtend(matched, refs, &stats) for _, bref := range matched { if err := enc.Encode(bref); err != nil { return err @@ -363,13 +364,16 @@ func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error { // set of biblioref docs (unchanged) plus raw references as biblioref, which // did not result in a match (determined by e.g. ref key and index). XXX: We // may have duplicate refs as well - how to distinguish them? -func matchedRefsExtend(matched []*BiblioRef, refs []*Ref) []*BiblioRef { +func matchedRefsExtend(matched []*BiblioRef, refs []*Ref, stats *statsAugment) []*BiblioRef { s := set.New() // store key + index of matched items for _, m := range matched { s.Add(m.RefKey + fmt.Sprintf("%d", m.RefIndex)) } for _, r := range refs { if s.Contains(r.Key + fmt.Sprintf("%d", r.Index)) { + stats.skipDuplicatedRef++ + log.Printf("skip-dup-ref [%d]: %v, %v", + stats.skipDuplicatedRef, r.Key, fmt.Sprintf("%d", r.Index)) continue } var bref BiblioRef @@ -392,7 +396,7 @@ func matchedRefsExtend(matched []*BiblioRef, refs []*Ref) []*BiblioRef { // uniqueMatches takes a list of bref docs (unserialized) and will return a // list of deserialized bref docs, containing unique matches only (e.g. filter // out things duplicate matches, e.g. from exact and fuzzy). -func uniqueMatches(docs []string) (result []*BiblioRef, err error) { +func uniqueMatches(docs []string, stats *statsAugment) (result []*BiblioRef, err error) { var ( brefs []*BiblioRef bref BiblioRef @@ -415,7 +419,9 @@ func uniqueMatches(docs []string) (result []*BiblioRef, err error) { for _, doc := range brefs { v := hash(doc) if seen.Contains(v) { - log.Printf("skip: %v %v %v", doc.SourceReleaseIdent, doc.MatchStatus, doc.MatchReason) + stats.skipDuplicatedBref++ + log.Printf("skip [%d]: %v %v %v", + stats.skipDuplicatedBref, doc.SourceReleaseIdent, doc.MatchStatus, doc.MatchReason) continue } seen.Add(v) @@ -424,6 +430,11 @@ func uniqueMatches(docs []string) (result []*BiblioRef, err error) { return result, nil } +type statsAugment struct { + skipDuplicatedBref int64 + skipDuplicatedRef int64 +} + // CutBatch runs Cut over a list of lines. func CutBatch(lines []string, column int) (result []string) { for _, line := range lines { diff --git a/skate/zippy_test.go b/skate/zippy_test.go index d0529c2..16aa74d 100644 --- a/skate/zippy_test.go +++ b/skate/zippy_test.go @@ -123,7 +123,7 @@ func TestUniqueMatches(t *testing.T) { }, } for _, c := range cases { - result, err := uniqueMatches(c.docs) + result, err := uniqueMatches(c.docs, &statsAugment{}) if err != c.err { t.Fatalf("got %v, want %v (%s)", err, c.err, c.about) } @@ -224,7 +224,7 @@ func TestMatchedRefsExtend(t *testing.T) { }, } for _, c := range cases { - result := matchedRefsExtend(c.matched, c.refs) + result := matchedRefsExtend(c.matched, c.refs, &statsAugment{}) if !reflect.DeepEqual(result, c.result) { t.Fatalf("got %v, want %v (%v)", result, c.result, pretty.Diff(result, c.result)) } |