diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-06-15 19:46:36 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-06-15 19:46:36 +0200 |
commit | dec49fea12f7809b0ec80dca5d8d688dc1124a57 (patch) | |
tree | 0d8bb9d8b7a44256cc6ddcfec3634716d9669288 | |
parent | 49dcdd29fb814fccf19c678740317c59421db8ce (diff) | |
download | refcat-dec49fea12f7809b0ec80dca5d8d688dc1124a57.tar.gz refcat-dec49fea12f7809b0ec80dca5d8d688dc1124a57.zip |
zippy: add deduplicateBrefs
-rw-r--r-- | skate/zippy.go | 39 |
1 file changed, 35 insertions(+), 4 deletions(-)
diff --git a/skate/zippy.go b/skate/zippy.go index 9691f6f..0d92873 100644 --- a/skate/zippy.go +++ b/skate/zippy.go @@ -1,9 +1,10 @@ -// This file contains various "reducers", e.g. working on two data streams and +// This file contains various "reducers", e.g. merging data from two streams and // applying a function on groups of documents with a shared key. // -// Note: This is a bit repetitive, but not want to introduce any other -// abstraction for now. Since most of the logic is in the grouper functions, we -// could make them top level and then assemble the zipkey runner on the fly. +// Note: This is a bit repetitive, but we do not want to introduce any other +// abstraction for now. Since most of the logic is in the "grouper" functions, +// we could make them top level values and then assemble the zipkey runner on +// the fly. // // The most confusing aspect currently is the variety of schemas hidden within // the readers (and string groups): release, ref, ref-as-release, open library, @@ -352,6 +353,7 @@ func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error { // 4kg2dejsgzaf3cszs2lt5hz4by_9, which appears three times, one // exact match, and twice unmatched). // TODO: remove duplicates + matched = deduplicateBrefs(matched) for _, bref := range matched { stats.total++ if err := enc.Encode(bref); err != nil { @@ -367,6 +369,35 @@ func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error { return err } +// deduplicateBrefs deduplicates by the document id (for elasticsearch), which +// may help filter out some duplicates but not all. +func deduplicateBrefs(brefs []*BiblioRef) []*BiblioRef { + // Sort by match status, exact first, unmatched last. 
+ sort.Slice(brefs, func(i, j int) bool { + switch { + case brefs[i].MatchStatus == StatusExact.Short(): + return true + case brefs[i].MatchStatus != StatusUnmatched.Short(): + return true + default: + return false + } + }) + var ( + unique []*BiblioRef + seen = set.New() + ) + for _, v := range brefs { + if seen.Contains(v.Key) { + continue + } + unique = append(unique, v) + seen.Add(v.Key) + } + log.Printf("trimmed brefs from %d to %d", len(brefs), len(unique)) + return unique +} + // matchedRefsExtend takes a set of (unique) biblioref docs and will emit that // set of biblioref docs (unchanged) plus raw references as biblioref, which // did not result in a match (determined by e.g. ref key and index). XXX: We |