diff options
Diffstat (limited to 'skate/verify.go')
-rw-r--r-- | skate/verify.go | 176 |
1 files changed, 10 insertions, 166 deletions
diff --git a/skate/verify.go b/skate/verify.go index 5367ffe..18b2f4e 100644 --- a/skate/verify.go +++ b/skate/verify.go @@ -14,11 +14,10 @@ import ( "unicode/utf8" "git.archive.org/martin/cgraph/skate/set" - "git.archive.org/martin/cgraph/skate/zipkey" json "github.com/segmentio/encoding/json" ) -// This file is a port of fuzzycat.verify to Go. +// This file contains a port of fuzzycat.verify to Go. type ( // Status represents match strength. @@ -135,7 +134,7 @@ func JsonMarshalNewline(v interface{}) ([]byte, error) { } // ClusterVerifyMaxClusterSize runs verification across all pairs in the cluster. This is a -// port of https://git.io/JYgOB from fuzzycat. +// port of https://git.io/JYgOB from fuzzycat. This is good for "self-match" verification. func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) { var ( rc *ReleaseCluster @@ -170,14 +169,16 @@ func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) { } // ClusterVerify runs verification process across all pairs, but skips clusters -// containing more than ten elements. +// containing more than ten elements. If a cluster has more than 10 elements, +// it might also signal a too ambiguous title. Besides, we do not want this to +// be too slow. func ClusterVerify(p []byte) ([]byte, error) { return ClusterVerifyMaxClusterSize(p, 10) } // RefClusterVerify deserializes a cluster document containing both converted // references and releases and returns a tabular verification result between -// one release and all references found. This depends on refs and releases +// one (any) release and all references found. This depends on refs and releases // being distinguishable, (e.g. via .extra.skate.status == "ref"). func RefClusterVerify(p []byte) ([]byte, error) { var ( @@ -208,7 +209,8 @@ func RefClusterVerify(p []byte) ([]byte, error) { return buf.Bytes(), nil } -// RefClusterToBiblioRef creates a BiblioRef schema from exact and strong matches only. 
+// RefClusterToBiblioRef runs verification and creates a BiblioRef schema from +// exact and strong matches only. func RefClusterToBiblioRef(p []byte) ([]byte, error) { var ( rc *ReleaseCluster @@ -236,13 +238,14 @@ func RefClusterToBiblioRef(p []byte) ([]byte, error) { br = generateBiblioRef(re, pivot, result, "fuzzy") return JsonMarshalNewline(br) default: + // XXX: may want to include non matches here. continue } } return buf.Bytes(), nil } -// generateBiblioRef generates a bibliographic schema document. +// generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd. func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef { var bref BiblioRef bref.SourceReleaseIdent = source.Ident @@ -261,165 +264,6 @@ func generateBiblioRef(source, target *Release, matchResult MatchResult, provena return &bref } -// makeKeyFunc creates a function that can be used as keyFunc, selecting a -// column from sep. -func makeKeyFunc(sep string, column int) func(string) (string, error) { - return func(s string) (string, error) { - if k := lineColumn(s, "\t", 2); k == "" { - return k, fmt.Errorf("cannot get key: %s", s) - } else { - return k, nil - } - } -} - -// ZipUnverified takes a release and refs reader (tsv, with ident, key, doc) -// and assigns a fixed match result. 
-func ZipUnverified(releases, refs io.Reader, mr MatchResult, provenance string, w io.Writer) error { - var ( - enc = json.NewEncoder(w) - keyer = makeKeyFunc("\t", 2) - grouper = func(g *zipkey.Group) error { - if len(g.G0) == 0 || len(g.G1) == 0 { - return nil - } - target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) - if err != nil { - return err - } - for _, line := range g.G1 { - ref, err := stringToRef(lineColumn(line, "\t", 3)) - if err != nil { - return err - } - var bref BiblioRef - bref.SourceReleaseIdent = ref.ReleaseIdent - bref.SourceWorkIdent = ref.WorkIdent - bref.SourceReleaseStage = ref.ReleaseStage - bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear) - bref.RefIndex = ref.Index + 1 // we want 1-index (also helps with omitempty) - bref.RefKey = ref.Key - bref.TargetReleaseIdent = target.Ident - bref.TargetWorkIdent = target.WorkID - bref.MatchProvenance = provenance - bref.MatchStatus = mr.Status.Short() - bref.MatchReason = mr.Reason.Short() - if err := enc.Encode(bref); err != nil { - return err - } - } - return nil - } - ) - zipper := zipkey.New(releases, refs, keyer, grouper) - return zipper.Run() -} - -// ZipWikiUnverified takes a release and wiki reader (tsv, with ident, key, doc) -// and assigns a fixed match result. -func ZipWikiUnverified(releases, wiki io.Reader, mr MatchResult, provenance string, w io.Writer) error { - var ( - enc = json.NewEncoder(w) - keyer = makeKeyFunc("\t", 2) - grouper = func(g *zipkey.Group) error { - if len(g.G0) == 0 || len(g.G1) == 0 { - return nil - } - target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) - if err != nil { - return err - } - for _, line := range g.G1 { - wiki, err := stringToWiki(lineColumn(line, "\t", 3)) - if err != nil { - return err - } - var bref BiblioRef - bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use? 
- bref.SourceWikipediaArticle = wiki.PageTitle - bref.TargetReleaseIdent = target.Ident - bref.TargetWorkIdent = target.WorkID - bref.MatchProvenance = provenance - bref.MatchStatus = mr.Status.Short() - bref.MatchReason = mr.Reason.Short() - if err := enc.Encode(bref); err != nil { - return err - } - } - return nil - } - ) - zipper := zipkey.New(releases, wiki, keyer, grouper) - return zipper.Run() -} - -// ZipVerifyRefs takes a release and refs reader (tsv, with ident, key, doc) -// and will execute gf for each group found. -func ZipVerifyRefs(releases, refs io.Reader, w io.Writer) error { - // Define a grouper, working on one set of refs and releases with the same - // key at a time. Here, we do verification and write out the generated - // biblioref. - var ( - enc = json.NewEncoder(w) - keyer = makeKeyFunc("\t", 2) - grouper = func(g *zipkey.Group) error { - if len(g.G0) == 0 || len(g.G1) == 0 { - return nil - } - pivot, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) - if err != nil { - return err - } - for _, line := range g.G1 { - re, err := stringToRelease(lineColumn(line, "\t", 3)) - if err != nil { - return err - } - result := Verify(pivot, re) - switch result.Status { - case StatusExact, StatusStrong: - if result.Reason == ReasonDOI { - continue - } - br := generateBiblioRef(re, pivot, result, "fuzzy") - if err := enc.Encode(br); err != nil { - return err - } - } - } - return nil - } - ) - zipper := zipkey.New(releases, refs, keyer, grouper) - return zipper.Run() -} - -// lineColumn returns a specific column (1-indexed, like cut) from a tabular -// file, returns empty string if column is invalid. 
-func lineColumn(line, sep string, column int) string { - parts := strings.Split(strings.TrimSpace(line), sep) - if len(parts) < column { - return "" - } else { - return parts[column-1] - } -} - -func stringToRelease(s string) (r *Release, err error) { - err = json.Unmarshal([]byte(s), &r) - return -} - -func stringToRef(s string) (r *Ref, err error) { - err = json.Unmarshal([]byte(s), &r) - return -} - -func stringToWiki(s string) (r *MinimalCitations, err error) { - err = json.Unmarshal([]byte(s), &r) - return -} - // Verify verifies two releases and will ignore short titles. func Verify(a, b *Release) MatchResult { return VerifyMinTitleLength(a, b, 5) |