diff options
Diffstat (limited to 'skate')
-rw-r--r-- | skate/zippy.go | 59 |
1 files changed, 58 insertions, 1 deletions
diff --git a/skate/zippy.go b/skate/zippy.go
index 5767b34..bd7bf70 100644
--- a/skate/zippy.go
+++ b/skate/zippy.go
@@ -14,8 +14,10 @@ import (
 	"fmt"
 	"io"
 	"log"
+	"sort"
 	"strings"
 
+	"git.archive.org/martin/cgraph/skate/set"
 	"git.archive.org/martin/cgraph/skate/zipkey"
 	json "github.com/segmentio/encoding/json"
 )
@@ -316,7 +318,56 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
 //
 // We can identify, which docs have been matched by checking the ref key and index.
 func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error {
-	return nil
+	var (
+		enc     = json.NewEncoder(w)
+		keyer   = makeKeyFunc("\t", 1)
+		grouper = func(g *zipkey.Group) error {
+			// g.G0 contains matched docs for a given work id, g.G1 all raw
+			// refs with the same work id (not used yet). Sort out duplicate
+			// matches first, e.g. docs that have the same source and target
+			// id, then emit the remaining ones.
+			unique, err := uniqueMatches(g.G0)
+			if err != nil {
+				return err
+			}
+			for _, match := range unique {
+				if err := enc.Encode(match); err != nil {
+					return err
+				}
+			}
+			return nil
+		}
+	)
+	zipper := zipkey.New(bref, raw, keyer, grouper)
+	return zipper.Run()
+}
+
+// uniqueMatches takes a list of serialized (JSON) bref docs and returns the
+// decoded docs, keeping only one doc per source and target id pair.
+func uniqueMatches(docs []string) (result []*BiblioRef, err error) {
+	var brefs []*BiblioRef
+	for _, doc := range docs {
+		var bref BiblioRef // fresh value per doc, so stored pointers do not alias
+		if err := json.Unmarshal([]byte(doc), &bref); err != nil {
+			return nil, err
+		}
+		brefs = append(brefs, &bref)
+	}
+	// Make sure exact matches come first, so they win over weaker duplicates.
+	exact := StatusExact.Short()
+	sort.SliceStable(brefs, func(i, j int) bool {
+		return brefs[i].MatchStatus == exact && brefs[j].MatchStatus != exact
+	})
+	// We consider a match unique, if source and target match.
+	seen := set.New()
+	for _, doc := range brefs {
+		v := doc.SourceReleaseIdent + doc.TargetReleaseIdent
+		if !seen.Contains(v) {
+			seen.Add(v)
+			result = append(result, doc)
+		}
+	}
+	return result, nil
 }
 
 // Cut returns a specific column (1-indexed, like CutSep) from a tabular
@@ -371,3 +422,9 @@ func stringToWiki(s string) (r *MinimalCitations, err error) {
 	err = json.Unmarshal([]byte(s), &r)
 	return
 }
+
+// stringToBiblioref decodes a JSON document into a BiblioRef.
+func stringToBiblioref(s string) (r *BiblioRef, err error) {
+	err = json.Unmarshal([]byte(s), &r)
+	return
+}