diff options
Diffstat (limited to 'skate/verify.go')
| -rw-r--r-- | skate/verify.go | 176 | 
1 file changed, 10 insertions, 166 deletions
| diff --git a/skate/verify.go b/skate/verify.go index 5367ffe..18b2f4e 100644 --- a/skate/verify.go +++ b/skate/verify.go @@ -14,11 +14,10 @@ import (  	"unicode/utf8"  	"git.archive.org/martin/cgraph/skate/set" -	"git.archive.org/martin/cgraph/skate/zipkey"  	json "github.com/segmentio/encoding/json"  ) -// This file is a port of fuzzycat.verify to Go. +// This file contains a port of fuzzycat.verify to Go.  type (  	// Status represents match strength. @@ -135,7 +134,7 @@ func JsonMarshalNewline(v interface{}) ([]byte, error) {  }  // ClusterVerifyMaxClusterSize runs verification across all pairs in the cluster. This is a -// port of https://git.io/JYgOB from fuzzycat. +// port of https://git.io/JYgOB from fuzzycat. This is good for "self-match" verification.  func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) {  	var (  		rc  *ReleaseCluster @@ -170,14 +169,16 @@ func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) {  }  // ClusterVerify runs verification process across all pairs, but skips clusters -// containing more than ten elements. +// containing more than ten elements. If a cluster has more than 10 elements, +// it might also signal a too ambiguous title. Besides, we do not want this to +// be too slow.  func ClusterVerify(p []byte) ([]byte, error) {  	return ClusterVerifyMaxClusterSize(p, 10)  }  // RefClusterVerify deserializes a cluster document containing both converted  // references and releases and returns a tabular verification result between -// one release and all references found. This depends on refs and releases +// one (any) release and all references found. This depends on refs and releases +// being distinguishable, (e.g. via .extra.skate.status == "ref").  
func RefClusterVerify(p []byte) ([]byte, error) {  	var ( @@ -208,7 +209,8 @@ func RefClusterVerify(p []byte) ([]byte, error) {  	return buf.Bytes(), nil  } -// RefClusterToBiblioRef creates a BiblioRef schema from exact and strong matches only. +// RefClusterToBiblioRef runs verification and creates a BiblioRef schema from +// exact and strong matches only.  func RefClusterToBiblioRef(p []byte) ([]byte, error) {  	var (  		rc        *ReleaseCluster @@ -236,13 +238,14 @@ func RefClusterToBiblioRef(p []byte) ([]byte, error) {  			br = generateBiblioRef(re, pivot, result, "fuzzy")  			return JsonMarshalNewline(br)  		default: +			// XXX: may want to include non matches here.  			continue  		}  	}  	return buf.Bytes(), nil  } -// generateBiblioRef generates a bibliographic schema document. +// generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd.  func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef {  	var bref BiblioRef  	bref.SourceReleaseIdent = source.Ident @@ -261,165 +264,6 @@ func generateBiblioRef(source, target *Release, matchResult MatchResult, provena  	return &bref  } -// makeKeyFunc creates a function that can be used as keyFunc, selecting a -// column from sep. -func makeKeyFunc(sep string, column int) func(string) (string, error) { -	return func(s string) (string, error) { -		if k := lineColumn(s, "\t", 2); k == "" { -			return k, fmt.Errorf("cannot get key: %s", s) -		} else { -			return k, nil -		} -	} -} - -// ZipUnverified takes a release and refs reader (tsv, with ident, key, doc) -// and assigns a fixed match result. 
-func ZipUnverified(releases, refs io.Reader, mr MatchResult, provenance string, w io.Writer) error { -	var ( -		enc     = json.NewEncoder(w) -		keyer   = makeKeyFunc("\t", 2) -		grouper = func(g *zipkey.Group) error { -			if len(g.G0) == 0 || len(g.G1) == 0 { -				return nil -			} -			target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) -			if err != nil { -				return err -			} -			for _, line := range g.G1 { -				ref, err := stringToRef(lineColumn(line, "\t", 3)) -				if err != nil { -					return err -				} -				var bref BiblioRef -				bref.SourceReleaseIdent = ref.ReleaseIdent -				bref.SourceWorkIdent = ref.WorkIdent -				bref.SourceReleaseStage = ref.ReleaseStage -				bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear) -				bref.RefIndex = ref.Index + 1 // we want 1-index (also helps with omitempty) -				bref.RefKey = ref.Key -				bref.TargetReleaseIdent = target.Ident -				bref.TargetWorkIdent = target.WorkID -				bref.MatchProvenance = provenance -				bref.MatchStatus = mr.Status.Short() -				bref.MatchReason = mr.Reason.Short() -				if err := enc.Encode(bref); err != nil { -					return err -				} -			} -			return nil -		} -	) -	zipper := zipkey.New(releases, refs, keyer, grouper) -	return zipper.Run() -} - -// ZipWikiUnverified takes a release and wiki reader (tsv, with ident, key, doc) -// and assigns a fixed match result. 
-func ZipWikiUnverified(releases, wiki io.Reader, mr MatchResult, provenance string, w io.Writer) error { -	var ( -		enc     = json.NewEncoder(w) -		keyer   = makeKeyFunc("\t", 2) -		grouper = func(g *zipkey.Group) error { -			if len(g.G0) == 0 || len(g.G1) == 0 { -				return nil -			} -			target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) -			if err != nil { -				return err -			} -			for _, line := range g.G1 { -				wiki, err := stringToWiki(lineColumn(line, "\t", 3)) -				if err != nil { -					return err -				} -				var bref BiblioRef -				bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use? -				bref.SourceWikipediaArticle = wiki.PageTitle -				bref.TargetReleaseIdent = target.Ident -				bref.TargetWorkIdent = target.WorkID -				bref.MatchProvenance = provenance -				bref.MatchStatus = mr.Status.Short() -				bref.MatchReason = mr.Reason.Short() -				if err := enc.Encode(bref); err != nil { -					return err -				} -			} -			return nil -		} -	) -	zipper := zipkey.New(releases, wiki, keyer, grouper) -	return zipper.Run() -} - -// ZipVerifyRefs takes a release and refs reader (tsv, with ident, key, doc) -// and will execute gf for each group found. -func ZipVerifyRefs(releases, refs io.Reader, w io.Writer) error { -	// Define a grouper, working on one set of refs and releases with the same -	// key at a time. Here, we do verification and write out the generated -	// biblioref. 
-	var ( -		enc     = json.NewEncoder(w) -		keyer   = makeKeyFunc("\t", 2) -		grouper = func(g *zipkey.Group) error { -			if len(g.G0) == 0 || len(g.G1) == 0 { -				return nil -			} -			pivot, err := stringToRelease(lineColumn(g.G0[0], "\t", 3)) -			if err != nil { -				return err -			} -			for _, line := range g.G1 { -				re, err := stringToRelease(lineColumn(line, "\t", 3)) -				if err != nil { -					return err -				} -				result := Verify(pivot, re) -				switch result.Status { -				case StatusExact, StatusStrong: -					if result.Reason == ReasonDOI { -						continue -					} -					br := generateBiblioRef(re, pivot, result, "fuzzy") -					if err := enc.Encode(br); err != nil { -						return err -					} -				} -			} -			return nil -		} -	) -	zipper := zipkey.New(releases, refs, keyer, grouper) -	return zipper.Run() -} - -// lineColumn returns a specific column (1-indexed, like cut) from a tabular -// file, returns empty string if column is invalid. -func lineColumn(line, sep string, column int) string { -	parts := strings.Split(strings.TrimSpace(line), sep) -	if len(parts) < column { -		return "" -	} else { -		return parts[column-1] -	} -} - -func stringToRelease(s string) (r *Release, err error) { -	err = json.Unmarshal([]byte(s), &r) -	return -} - -func stringToRef(s string) (r *Ref, err error) { -	err = json.Unmarshal([]byte(s), &r) -	return -} - -func stringToWiki(s string) (r *MinimalCitations, err error) { -	err = json.Unmarshal([]byte(s), &r) -	return -} -  // Verify verifies two releases and will ignore short titles.  func Verify(a, b *Release) MatchResult {  	return VerifyMinTitleLength(a, b, 5) | 
