From 1737eb03e6c7cd5d316ac081a4cea07787ad4429 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Sat, 10 Jul 2021 00:10:05 +0200
Subject: reduce: filter out duplicate wiki links

---
 skate/reduce.go | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'skate')

diff --git a/skate/reduce.go b/skate/reduce.go
index cd63bb1..255f281 100644
--- a/skate/reduce.go
+++ b/skate/reduce.go
@@ -158,6 +158,10 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
 			if target, err = parseRelease(Cut(g.G0[0], 2)); err != nil {
 				return err
 			}
+			// Sort out a few duplicates, e.g.
+			// lfqxs3tv_obj3cjr5wrhjffnmgze5jn7a4a,
+			// z2kc233qnfxwszbaojswgzlqorxxe_f7mn45dvyvespbv2pxgyt674k4, ...
+			seen := set.New()
 			for _, line := range g.G1 {
 				if wiki, err = parseWiki(Cut(line, 3)); err != nil {
 					return err
@@ -168,6 +172,10 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error
 				key := fmt.Sprintf("%s_%s",
 					strings.ToLower(b32enc.EncodeToString([]byte(wiki.PageTitle))),
 					target.Ident)
+				if seen.Contains(key) {
+					continue
+				}
+				seen.Add(key)
 				bref.Key = key
 				bref.SourceWikipediaArticle = wiki.PageTitle
 				bref.TargetReleaseIdent = target.Ident
-- 
cgit v1.2.3