From c801b2ff0aa93489eed89ddb1f2d62404fc89ca2 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 5 May 2021 16:54:48 +0200 Subject: split functionality up a bit --- skate/cmd/skate-verify/main.go | 6 +- skate/schema.go | 2 +- skate/verify.go | 176 +++-------------------------------------- skate/zippy.go | 171 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 185 insertions(+), 170 deletions(-) create mode 100644 skate/zippy.go (limited to 'skate') diff --git a/skate/cmd/skate-verify/main.go b/skate/cmd/skate-verify/main.go index 3b40488..895d508 100644 --- a/skate/cmd/skate-verify/main.go +++ b/skate/cmd/skate-verify/main.go @@ -81,7 +81,7 @@ func main() { if !ok { mr = matchResults["unknown"] } - if err := skate.ZipUnverified(f, g, mr, *provenance, bw); err != nil { + if err := skate.ZippyFixed(f, g, mr, *provenance, bw); err != nil { log.Fatal(err) } case "zip": @@ -102,7 +102,7 @@ func main() { defer g.Close() bw := bufio.NewWriter(os.Stdout) defer bw.Flush() - if err := skate.ZipVerifyRefs(f, g, bw); err != nil { + if err := skate.ZippyVerifyRefs(f, g, bw); err != nil { log.Fatal(err) } case "ref": @@ -138,7 +138,7 @@ func main() { defer g.Close() bw := bufio.NewWriter(os.Stdout) defer bw.Flush() - if err := skate.ZipWikiUnverified(f, g, skate.MatchResult{skate.StatusExact, skate.ReasonDOI}, "wiki", bw); err != nil { + if err := skate.ZippyFixedWiki(f, g, skate.MatchResult{skate.StatusExact, skate.ReasonDOI}, "wiki", bw); err != nil { log.Fatal(err) } default: diff --git a/skate/schema.go b/skate/schema.go index 1878205..52aa91a 100644 --- a/skate/schema.go +++ b/skate/schema.go @@ -305,7 +305,7 @@ type BiblioRef struct { } // ReleaseCluster, a list of match candidates. This is typically serialized as a -// single JSON line. +// single JSON line containing the match key and a list of release documents. 
type ReleaseCluster struct { Key string `json:"k"` Values []*Release `json:"v"` } diff --git a/skate/verify.go b/skate/verify.go index 5367ffe..18b2f4e 100644 --- a/skate/verify.go +++ b/skate/verify.go @@ -14,11 +14,10 @@ import ( "unicode/utf8" "git.archive.org/martin/cgraph/skate/set" - "git.archive.org/martin/cgraph/skate/zipkey" json "github.com/segmentio/encoding/json" ) -// This file is a port of fuzzycat.verify to Go. +// This file contains a port of fuzzycat.verify to Go. type ( // Status represents match strength. @@ -135,7 +134,7 @@ func JsonMarshalNewline(v interface{}) ([]byte, error) { } // ClusterVerifyMaxClusterSize runs verification across all pairs in the cluster. This is a -// port of https://git.io/JYgOB from fuzzycat. +// port of https://git.io/JYgOB from fuzzycat. This is good for "self-match" verification. func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) { var ( rc *ReleaseCluster @@ -170,14 +169,16 @@ func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) { } // ClusterVerify runs verification process across all pairs, but skips clusters -// containing more than ten elements. +// containing more than ten elements. If a cluster has more than 10 elements, +// it might also signal a too ambiguous title. Besides, we do not want this to +// be too slow. func ClusterVerify(p []byte) ([]byte, error) { return ClusterVerifyMaxClusterSize(p, 10) } // RefClusterVerify deserializes a cluster document containing both converted // references and releases and returns a tabular verification result between -// one release and all references found. This depends on refs and releases +// one (any) release and all references found. This depends on refs and releases // being distinguishable, (e.g. via .extra.skate.status == "ref"). 
func RefClusterVerify(p []byte) ([]byte, error) { var ( @@ -208,7 +209,8 @@ func RefClusterVerify(p []byte) ([]byte, error) { return buf.Bytes(), nil } -// RefClusterToBiblioRef creates a BiblioRef schema from exact and strong matches only. +// RefClusterToBiblioRef runs verification and creates a BiblioRef schema from +// exact and strong matches only. func RefClusterToBiblioRef(p []byte) ([]byte, error) { var ( rc *ReleaseCluster @@ -236,13 +238,14 @@ func RefClusterToBiblioRef(p []byte) ([]byte, error) { br = generateBiblioRef(re, pivot, result, "fuzzy") return JsonMarshalNewline(br) default: + // XXX: may want to include non matches here. continue } } return buf.Bytes(), nil } -// generateBiblioRef generates a bibliographic schema document. +// generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd. func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef { var bref BiblioRef bref.SourceReleaseIdent = source.Ident @@ -261,165 +264,6 @@ func generateBiblioRef(source, target *Release, matchResult MatchResult, provena return &bref } -// makeKeyFunc creates a function that can be used as keyFunc, selecting a -// column from sep. -func makeKeyFunc(sep string, column int) func(string) (string, error) { - return func(s string) (string, error) { - if k := lineColumn(s, "\t", 2); k == "" { - return k, fmt.Errorf("cannot get key: %s", s) - } else { - return k, nil - } - } -} - -// ZipUnverified takes a release and refs reader (tsv, with ident, key, doc) -// and assigns a fixed match result. 
-func ZipUnverified(releases, refs io.Reader, mr MatchResult, provenance string, w io.Writer) error { - var ( - enc = json.NewEncoder(w) - keyer = makeKeyFunc("\t", 2) - grouper = func(g *zipkey.Group) error { - if len(g.G0) == 0 || len(g.G1) == 0 { - return nil - } - target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) - if err != nil { - return err - } - for _, line := range g.G1 { - ref, err := stringToRef(lineColumn(line, "\t", 3)) - if err != nil { - return err - } - var bref BiblioRef - bref.SourceReleaseIdent = ref.ReleaseIdent - bref.SourceWorkIdent = ref.WorkIdent - bref.SourceReleaseStage = ref.ReleaseStage - bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear) - bref.RefIndex = ref.Index + 1 // we want 1-index (also helps with omitempty) - bref.RefKey = ref.Key - bref.TargetReleaseIdent = target.Ident - bref.TargetWorkIdent = target.WorkID - bref.MatchProvenance = provenance - bref.MatchStatus = mr.Status.Short() - bref.MatchReason = mr.Reason.Short() - if err := enc.Encode(bref); err != nil { - return err - } - } - return nil - } - ) - zipper := zipkey.New(releases, refs, keyer, grouper) - return zipper.Run() -} - -// ZipWikiUnverified takes a release and wiki reader (tsv, with ident, key, doc) -// and assigns a fixed match result. -func ZipWikiUnverified(releases, wiki io.Reader, mr MatchResult, provenance string, w io.Writer) error { - var ( - enc = json.NewEncoder(w) - keyer = makeKeyFunc("\t", 2) - grouper = func(g *zipkey.Group) error { - if len(g.G0) == 0 || len(g.G1) == 0 { - return nil - } - target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) - if err != nil { - return err - } - for _, line := range g.G1 { - wiki, err := stringToWiki(lineColumn(line, "\t", 3)) - if err != nil { - return err - } - var bref BiblioRef - bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use? 
- bref.SourceWikipediaArticle = wiki.PageTitle - bref.TargetReleaseIdent = target.Ident - bref.TargetWorkIdent = target.WorkID - bref.MatchProvenance = provenance - bref.MatchStatus = mr.Status.Short() - bref.MatchReason = mr.Reason.Short() - if err := enc.Encode(bref); err != nil { - return err - } - } - return nil - } - ) - zipper := zipkey.New(releases, wiki, keyer, grouper) - return zipper.Run() -} - -// ZipVerifyRefs takes a release and refs reader (tsv, with ident, key, doc) -// and will execute gf for each group found. -func ZipVerifyRefs(releases, refs io.Reader, w io.Writer) error { - // Define a grouper, working on one set of refs and releases with the same - // key at a time. Here, we do verification and write out the generated - // biblioref. - var ( - enc = json.NewEncoder(w) - keyer = makeKeyFunc("\t", 2) - grouper = func(g *zipkey.Group) error { - if len(g.G0) == 0 || len(g.G1) == 0 { - return nil - } - pivot, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) - if err != nil { - return err - } - for _, line := range g.G1 { - re, err := stringToRelease(lineColumn(line, "\t", 3)) - if err != nil { - return err - } - result := Verify(pivot, re) - switch result.Status { - case StatusExact, StatusStrong: - if result.Reason == ReasonDOI { - continue - } - br := generateBiblioRef(re, pivot, result, "fuzzy") - if err := enc.Encode(br); err != nil { - return err - } - } - } - return nil - } - ) - zipper := zipkey.New(releases, refs, keyer, grouper) - return zipper.Run() -} - -// lineColumn returns a specific column (1-indexed, like cut) from a tabular -// file, returns empty string if column is invalid. 
-func lineColumn(line, sep string, column int) string { - parts := strings.Split(strings.TrimSpace(line), sep) - if len(parts) < column { - return "" - } else { - return parts[column-1] - } -} - -func stringToRelease(s string) (r *Release, err error) { - err = json.Unmarshal([]byte(s), &r) - return -} - -func stringToRef(s string) (r *Ref, err error) { - err = json.Unmarshal([]byte(s), &r) - return -} - -func stringToWiki(s string) (r *MinimalCitations, err error) { - err = json.Unmarshal([]byte(s), &r) - return -} - // Verify verifies two releases and will ignore short titles. func Verify(a, b *Release) MatchResult { return VerifyMinTitleLength(a, b, 5) diff --git a/skate/zippy.go b/skate/zippy.go new file mode 100644 index 0000000..76f576d --- /dev/null +++ b/skate/zippy.go @@ -0,0 +1,171 @@ +package skate + +import ( + "fmt" + "io" + "strings" + + "git.archive.org/martin/cgraph/skate/zipkey" + json "github.com/segmentio/encoding/json" +) + +// This file contains the two-stream (zippy) matchers. + +// ZippyFixed takes a release and refs reader (tsv, with ident, key, doc) +// and assigns a fixed match result. 
+func ZippyFixed(releases, refs io.Reader, mr MatchResult, provenance string, w io.Writer) error { + var ( + enc = json.NewEncoder(w) + keyer = makeKeyFunc("\t", 2) + grouper = func(g *zipkey.Group) error { + if len(g.G0) == 0 || len(g.G1) == 0 { + return nil + } + target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) + if err != nil { + return err + } + for _, line := range g.G1 { + ref, err := stringToRef(lineColumn(line, "\t", 3)) + if err != nil { + return err + } + var bref BiblioRef + bref.SourceReleaseIdent = ref.ReleaseIdent + bref.SourceWorkIdent = ref.WorkIdent + bref.SourceReleaseStage = ref.ReleaseStage + bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear) + bref.RefIndex = ref.Index + 1 // we want 1-index (also helps with omitempty) + bref.RefKey = ref.Key + bref.TargetReleaseIdent = target.Ident + bref.TargetWorkIdent = target.WorkID + bref.MatchProvenance = provenance + bref.MatchStatus = mr.Status.Short() + bref.MatchReason = mr.Reason.Short() + if err := enc.Encode(bref); err != nil { + return err + } + } + return nil + } + ) + zipper := zipkey.New(releases, refs, keyer, grouper) + return zipper.Run() +} + +// ZippyFixedWiki takes a release and wiki reader (tsv, with ident, key, doc) +// and assigns a fixed match result. +func ZippyFixedWiki(releases, wiki io.Reader, mr MatchResult, provenance string, w io.Writer) error { + var ( + enc = json.NewEncoder(w) + keyer = makeKeyFunc("\t", 2) + grouper = func(g *zipkey.Group) error { + if len(g.G0) == 0 || len(g.G1) == 0 { + return nil + } + target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) + if err != nil { + return err + } + for _, line := range g.G1 { + wiki, err := stringToWiki(lineColumn(line, "\t", 3)) + if err != nil { + return err + } + var bref BiblioRef + bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use? 
+ bref.SourceWikipediaArticle = wiki.PageTitle + bref.TargetReleaseIdent = target.Ident + bref.TargetWorkIdent = target.WorkID + bref.MatchProvenance = provenance + bref.MatchStatus = mr.Status.Short() + bref.MatchReason = mr.Reason.Short() + if err := enc.Encode(bref); err != nil { + return err + } + } + return nil + } + ) + zipper := zipkey.New(releases, wiki, keyer, grouper) + return zipper.Run() +} + +// ZippyVerifyRefs takes a release and refs reader (tsv, with ident, key, doc) +// and will execute gf for each group found. +func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error { + // Define a grouper, working on one set of refs and releases with the same + // key at a time. Here, we do verification and write out the generated + // biblioref. + var ( + enc = json.NewEncoder(w) + keyer = makeKeyFunc("\t", 2) + grouper = func(g *zipkey.Group) error { + if len(g.G0) == 0 || len(g.G1) == 0 { + return nil + } + pivot, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) + if err != nil { + return err + } + for _, line := range g.G1 { + re, err := stringToRelease(lineColumn(line, "\t", 3)) + if err != nil { + return err + } + result := Verify(pivot, re) + switch result.Status { + case StatusExact, StatusStrong: + if result.Reason == ReasonDOI { + continue + } + br := generateBiblioRef(re, pivot, result, "fuzzy") + if err := enc.Encode(br); err != nil { + return err + } + } + } + return nil + } + ) + zipper := zipkey.New(releases, refs, keyer, grouper) + return zipper.Run() +} + +// makeKeyFunc creates a function that can be used as keyFunc, selecting a +// column from sep. +func makeKeyFunc(sep string, column int) func(string) (string, error) { + return func(s string) (string, error) { + if k := lineColumn(s, "\t", 2); k == "" { + return k, fmt.Errorf("cannot get key: %s", s) + } else { + return k, nil + } + } +} + +// lineColumn returns a specific column (1-indexed, like cut) from a tabular +// file, returns empty string if column is invalid. 
+func lineColumn(line, sep string, column int) string { + parts := strings.Split(strings.TrimSpace(line), sep) + if len(parts) < column { + return "" + } else { + return parts[column-1] + } +} + +func stringToRelease(s string) (r *Release, err error) { + err = json.Unmarshal([]byte(s), &r) + return +} + +func stringToRef(s string) (r *Ref, err error) { + err = json.Unmarshal([]byte(s), &r) + return +} + +func stringToWiki(s string) (r *MinimalCitations, err error) { + err = json.Unmarshal([]byte(s), &r) + return +} -- cgit v1.2.3