package skate

import (
	"fmt"
	"io"
	"log"
	"strings"

	"git.archive.org/martin/cgraph/skate/zipkey"

	json "github.com/segmentio/encoding/json"
)

// This file contains the two-stream (zippy) matchers.

// groupLogf logs a message and a serialized group.
func groupLogf(g *zipkey.Group, s string, vs ...interface{}) {
	log.Printf(s, vs...)
	b, err := json.Marshal(g)
	if err != nil {
		// A group that cannot be serialized should not abort processing; note
		// the failure instead of silently printing an empty line.
		log.Printf("cannot marshal group: %v", err)
		return
	}
	log.Println(string(b))
}

// ZippyExact takes a release and refs reader (tsv, with ident, key, doc)
// and assigns a fixed match result to every (release, ref) pair sharing a
// key, writing one BiblioRef JSON document per pair to w.
// XXX: allow empty keys
func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer) error {
	var (
		enc     = json.NewEncoder(w)
		keyer   = makeKeyFunc("\t", 1)
		i       = 0 // number of groups seen, for progress logging
		grouper = func(g *zipkey.Group) error {
			i++
			if i%10000 == 0 {
				log.Printf("processed %v groups", i)
			}
			var (
				target *Release
				ref    *Ref
				err    error
			)
			// Need at least one release and one ref to form a match.
			if len(g.G0) == 0 || len(g.G1) == 0 {
				return nil
			}
			if target, err = stringToRelease(cut(g.G0[0], "\t", 2)); err != nil {
				groupLogf(g, "[skip] failed to parse release: %v", err)
				return nil
			}
			for _, line := range g.G1 {
				if ref, err = stringToRef(cut(line, "\t", 2)); err != nil {
					groupLogf(g, "[skip] failed to parse ref: %v", err)
					continue
				}
				var bref BiblioRef
				bref.SourceReleaseIdent = ref.ReleaseIdent
				bref.SourceWorkIdent = ref.WorkIdent
				bref.SourceReleaseStage = ref.ReleaseStage
				bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear)
				bref.RefIndex = ref.Index + 1 // we want 1-index (also helps with omitempty)
				bref.RefKey = ref.Key
				bref.TargetReleaseIdent = target.Ident
				bref.TargetWorkIdent = target.WorkID
				bref.MatchProvenance = ref.RefSource
				bref.MatchStatus = matchResult.Status.Short()
				bref.MatchReason = matchResult.Reason.Short()
				if err := enc.Encode(bref); err != nil {
					return err
				}
			}
			return nil
		}
	)
	zipper := zipkey.New(releases, refs, keyer, grouper)
	return zipper.Run()
}

// ZippyExactWiki takes a release and wiki reader (tsv, with key, doc)
// and assigns a fixed match result.
func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error { var ( enc = json.NewEncoder(w) keyer = makeKeyFunc("\t", 1) grouper = func(g *zipkey.Group) error { var ( target *Release wiki *MinimalCitations err error ) if len(g.G0) == 0 || len(g.G1) == 0 { return nil } if target, err = stringToRelease(cut(g.G0[0], "\t", 2)); err != nil { return err } for _, line := range g.G1 { if wiki, err = stringToWiki(cut(line, "\t", 2)); err != nil { return err } var bref BiblioRef bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use? bref.SourceWikipediaArticle = wiki.PageTitle bref.TargetReleaseIdent = target.Ident bref.TargetWorkIdent = target.WorkID bref.MatchProvenance = "wikipedia" bref.MatchStatus = mr.Status.Short() bref.MatchReason = mr.Reason.Short() if err := enc.Encode(bref); err != nil { return err } } return nil } ) zipper := zipkey.New(releases, wiki, keyer, grouper) return zipper.Run() } // ZippyVerifyRefs takes a release and refs reader (tsv, with ident, key, doc) // and will execute gf for each group found. func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error { // Define a grouper, working on one set of refs and releases with the same // key at a time. Here, we do verification and write out the generated // biblioref. var ( enc = json.NewEncoder(w) keyer = makeKeyFunc("\t", 1) grouper = func(g *zipkey.Group) error { var ( re, pivot *Release err error ) if len(g.G0) == 0 || len(g.G1) == 0 { return nil } if pivot, err = stringToRelease(cut(g.G0[0], "\t", 2)); err != nil { return err } for _, line := range g.G1 { if re, err = stringToRelease(cut(line, "\t", 2)); err != nil { return err } result := Verify(pivot, re) switch result.Status { case StatusExact, StatusStrong: if result.Reason == ReasonDOI { continue } br := generateBiblioRef(re, pivot, result, "fuzzy") if err := enc.Encode(br); err != nil { return err } default: // XXX: We want to add unmatched pieces as well; here? 
We // probably want to do a single final pass to complete the // dataset. } } return nil } ) zipper := zipkey.New(releases, refs, keyer, grouper) return zipper.Run() } // ZippyRefsOpenLibrary takes a release and refs reader (tsv, with ident, key, doc) // and will execute gf for each group found. func ZippyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { // Define a grouper, working on one set of refs and releases with the same // key at a time. Here, we do verification and write out the generated // biblioref. var ( enc = json.NewEncoder(w) keyer = makeKeyFunc("\t", 1) grouper = func(g *zipkey.Group) error { var ( re, pivot *Release err error ) if len(g.G0) == 0 || len(g.G1) == 0 { return nil } if pivot, err = stringToRelease(cut(g.G0[0], "\t", 2)); err != nil { return err } for _, line := range g.G1 { if re, err = stringToRelease(cut(line, "\t", 2)); err != nil { return err } result := Verify(pivot, re) switch result.Status { case StatusExact, StatusStrong: if result.Reason == ReasonDOI { continue } br := generateBiblioRef(re, pivot, result, "fuzzy") if err := enc.Encode(br); err != nil { return err } default: // XXX: We want to add unmatched pieces as well; here? We // probably want to do a single final pass to complete the // dataset. } } return nil } ) zipper := zipkey.New(olr, refs, keyer, grouper) return zipper.Run() } // makeKeyFunc creates a function that can be used as keyFunc, selecting a // column from fields separated by sep; column is 1-indexed. func makeKeyFunc(sep string, column int) func(string) (string, error) { return func(s string) (string, error) { if k := cut(s, sep, column); k == "" { return k, fmt.Errorf("cannot get key: %s", s) } else { return k, nil } } } // cut returns a specific column (1-indexed, like cut) from a tabular // file, returns empty string if column is invalid. 
func cut(line, sep string, column int) string { parts := strings.Split(strings.TrimSpace(line), sep) if len(parts) < column { return "" } else { return parts[column-1] } } func stringToRelease(s string) (r *Release, err error) { err = json.Unmarshal([]byte(s), &r) return } func stringToRef(s string) (r *Ref, err error) { err = json.Unmarshal([]byte(s), &r) return } func stringToWiki(s string) (r *MinimalCitations, err error) { err = json.Unmarshal([]byte(s), &r) return }