diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-06-15 19:46:36 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-06-15 19:46:36 +0200 |
commit | dec49fea12f7809b0ec80dca5d8d688dc1124a57 (patch) | |
tree | 0d8bb9d8b7a44256cc6ddcfec3634716d9669288 | |
parent | 49dcdd29fb814fccf19c678740317c59421db8ce (diff) | |
download | refcat-dec49fea12f7809b0ec80dca5d8d688dc1124a57.tar.gz refcat-dec49fea12f7809b0ec80dca5d8d688dc1124a57.zip |
zippy: add deduplicateBrefs
-rw-r--r-- | skate/zippy.go | 39 |
1 file changed, 35 insertions(+), 4 deletions(-)
diff --git a/skate/zippy.go b/skate/zippy.go index 9691f6f..0d92873 100644 --- a/skate/zippy.go +++ b/skate/zippy.go @@ -1,9 +1,10 @@ -// This file contains various "reducers", e.g. working on two data streams and +// This file contains various "reducers", e.g. merging data from two streams and // applying a function on groups of documents with a shared key. // -// Note: This is a bit repetitive, but not want to introduce any other -// abstraction for now. Since most of the logic is in the grouper functions, we -// could make them top level and then assemble the zipkey runner on the fly. +// Note: This is a bit repetitive, but we do not want to introduce any other +// abstraction for now. Since most of the logic is in the "grouper" functions, +// we could make them top level values and then assemble the zipkey runner on +// the fly. // // The most confusing aspect currently is the variety of schemas hidden within // the readers (and string groups): release, ref, ref-as-release, open library, @@ -352,6 +353,7 @@ func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error { // 4kg2dejsgzaf3cszs2lt5hz4by_9, which appears three times, one // exact match, and twice unmatched). // TODO: remove duplicates + matched = deduplicateBrefs(matched) for _, bref := range matched { stats.total++ if err := enc.Encode(bref); err != nil { @@ -367,6 +369,35 @@ func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error { return err } +// deduplicateBrefs deduplicates by the document id (for elasticsearch), which +// may help filter out some duplicates but not all. +func deduplicateBrefs(brefs []*BiblioRef) []*BiblioRef { + // Sort by match status, exact first, unmatched last. 
+ sort.Slice(brefs, func(i, j int) bool { + switch { + case brefs[i].MatchStatus == StatusExact.Short(): + return true + case brefs[i].MatchStatus != StatusUnmatched.Short(): + return true + default: + return false + } + }) + var ( + unique []*BiblioRef + seen = set.New() + ) + for _, v := range brefs { + if seen.Contains(v.Key) { + continue + } + unique = append(unique, v) + seen.Add(v.Key) + } + log.Printf("trimmed brefs from %d to %d", len(brefs), len(unique)) + return unique +} + // matchedRefsExtend takes a set of (unique) biblioref docs and will emit that // set of biblioref docs (unchanged) plus raw references as biblioref, which // did not result in a match (determined by e.g. ref key and index). XXX: We |