diff options
Diffstat (limited to 'skate/zippy.go')
-rw-r--r-- | skate/zippy.go | 171 |
1 files changed, 171 insertions, 0 deletions
diff --git a/skate/zippy.go b/skate/zippy.go new file mode 100644 index 0000000..76f576d --- /dev/null +++ b/skate/zippy.go @@ -0,0 +1,171 @@ +package skate + +import ( + "fmt" + "io" + "strings" + + "git.archive.org/martin/cgraph/skate/zipkey" + json "github.com/segmentio/encoding/json" +) + +// This file contains the two-stream (zippy) matchers. + +// ZippyFixed takes a release and refs reader (tsv, with ident, key, doc) +// and assigns a fixed match result. +func ZippyFixed(releases, refs io.Reader, mr MatchResult, provenance string, w io.Writer) error { + var ( + enc = json.NewEncoder(w) + keyer = makeKeyFunc("\t", 2) + grouper = func(g *zipkey.Group) error { + if len(g.G0) == 0 || len(g.G1) == 0 { + return nil + } + target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) + if err != nil { + return err + } + for _, line := range g.G1 { + ref, err := stringToRef(lineColumn(line, "\t", 3)) + if err != nil { + return err + } + var bref BiblioRef + bref.SourceReleaseIdent = ref.ReleaseIdent + bref.SourceWorkIdent = ref.WorkIdent + bref.SourceReleaseStage = ref.ReleaseStage + bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear) + bref.RefIndex = ref.Index + 1 // we want 1-index (also helps with omitempty) + bref.RefKey = ref.Key + bref.TargetReleaseIdent = target.Ident + bref.TargetWorkIdent = target.WorkID + bref.MatchProvenance = provenance + bref.MatchStatus = mr.Status.Short() + bref.MatchReason = mr.Reason.Short() + if err := enc.Encode(bref); err != nil { + return err + } + } + return nil + } + ) + zipper := zipkey.New(releases, refs, keyer, grouper) + return zipper.Run() +} + +// ZippyFixedWiki takes a release and wiki reader (tsv, with ident, key, doc) +// and assigns a fixed match result. +func ZippyFixedWiki(releases, wiki io.Reader, mr MatchResult, provenance string, w io.Writer) error { + var ( + enc = json.NewEncoder(w) + keyer = makeKeyFunc("\t", 2) + grouper = func(g *zipkey.Group) error { + if len(g.G0) == 0 || len(g.G1) == 0 { + return nil + } + target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) + if err != nil { + return err + } + for _, line := range g.G1 { + wiki, err := stringToWiki(lineColumn(line, "\t", 3)) + if err != nil { + return err + } + var bref BiblioRef + bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use? + bref.SourceWikipediaArticle = wiki.PageTitle + bref.TargetReleaseIdent = target.Ident + bref.TargetWorkIdent = target.WorkID + bref.MatchProvenance = provenance + bref.MatchStatus = mr.Status.Short() + bref.MatchReason = mr.Reason.Short() + if err := enc.Encode(bref); err != nil { + return err + } + } + return nil + } + ) + zipper := zipkey.New(releases, wiki, keyer, grouper) + return zipper.Run() +} + +// ZippyVerifyRefs takes a release and refs reader (tsv, with ident, key, doc) +// and will execute gf for each group found. +func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error { + // Define a grouper, working on one set of refs and releases with the same + // key at a time. Here, we do verification and write out the generated + // biblioref. + var ( + enc = json.NewEncoder(w) + keyer = makeKeyFunc("\t", 2) + grouper = func(g *zipkey.Group) error { + if len(g.G0) == 0 || len(g.G1) == 0 { + return nil + } + pivot, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) + if err != nil { + return err + } + for _, line := range g.G1 { + re, err := stringToRelease(lineColumn(line, "\t", 3)) + if err != nil { + return err + } + result := Verify(pivot, re) + switch result.Status { + case StatusExact, StatusStrong: + if result.Reason == ReasonDOI { + continue + } + br := generateBiblioRef(re, pivot, result, "fuzzy") + if err := enc.Encode(br); err != nil { + return err + } + } + } + return nil + } + ) + zipper := zipkey.New(releases, refs, keyer, grouper) + return zipper.Run() +} + +// makeKeyFunc creates a function that can be used as keyFunc, selecting a +// column from sep. +func makeKeyFunc(sep string, column int) func(string) (string, error) { + return func(s string) (string, error) { + if k := lineColumn(s, "\t", 2); k == "" { + return k, fmt.Errorf("cannot get key: %s", s) + } else { + return k, nil + } + } +} + +// lineColumn returns a specific column (1-indexed, like cut) from a tabular +// file, returns empty string if column is invalid. +func lineColumn(line, sep string, column int) string { + parts := strings.Split(strings.TrimSpace(line), sep) + if len(parts) < column { + return "" + } else { + return parts[column-1] + } +} + +func stringToRelease(s string) (r *Release, err error) { + err = json.Unmarshal([]byte(s), &r) + return +} + +func stringToRef(s string) (r *Ref, err error) { + err = json.Unmarshal([]byte(s), &r) + return +} + +func stringToWiki(s string) (r *MinimalCitations, err error) { + err = json.Unmarshal([]byte(s), &r) + return +} |