aboutsummaryrefslogtreecommitdiffstats
path: root/skate/verify.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/verify.go')
-rw-r--r--skate/verify.go176
1 file changed, 10 insertions, 166 deletions
diff --git a/skate/verify.go b/skate/verify.go
index 5367ffe..18b2f4e 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -14,11 +14,10 @@ import (
"unicode/utf8"
"git.archive.org/martin/cgraph/skate/set"
- "git.archive.org/martin/cgraph/skate/zipkey"
json "github.com/segmentio/encoding/json"
)
-// This file is a port of fuzzycat.verify to Go.
+// This file contains a port of fuzzycat.verify to Go.
type (
// Status represents match strength.
@@ -135,7 +134,7 @@ func JsonMarshalNewline(v interface{}) ([]byte, error) {
}
// ClusterVerifyMaxClusterSize runs verification across all pairs in the cluster. This is a
-// port of https://git.io/JYgOB from fuzzycat.
+// port of https://git.io/JYgOB from fuzzycat. This is good for "self-match" verification.
func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) {
var (
rc *ReleaseCluster
@@ -170,14 +169,16 @@ func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) {
}
// ClusterVerify runs verification process across all pairs, but skips clusters
-// containing more than ten elements.
+// containing more than ten elements. If a cluster has more than 10 elements,
+// it might also signal an overly ambiguous title. Besides, we do not want
+// this to be too slow.
func ClusterVerify(p []byte) ([]byte, error) {
return ClusterVerifyMaxClusterSize(p, 10)
}
// RefClusterVerify deserializes a cluster document containing both converted
// references and releases and returns a tabular verification result between
-// one release and all references found. This depends on refs and releases
+// one (any) release and all references found. This depends on refs and releases
// being distinguishable, (e.g. via .extra.skate.status == "ref").
func RefClusterVerify(p []byte) ([]byte, error) {
var (
@@ -208,7 +209,8 @@ func RefClusterVerify(p []byte) ([]byte, error) {
return buf.Bytes(), nil
}
-// RefClusterToBiblioRef creates a BiblioRef schema from exact and strong matches only.
+// RefClusterToBiblioRef runs verification and creates a BiblioRef schema from
+// exact and strong matches only.
func RefClusterToBiblioRef(p []byte) ([]byte, error) {
var (
rc *ReleaseCluster
@@ -236,13 +238,14 @@ func RefClusterToBiblioRef(p []byte) ([]byte, error) {
br = generateBiblioRef(re, pivot, result, "fuzzy")
return JsonMarshalNewline(br)
default:
+ // XXX: may want to include non matches here.
continue
}
}
return buf.Bytes(), nil
}
-// generateBiblioRef generates a bibliographic schema document.
+// generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd.
func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef {
var bref BiblioRef
bref.SourceReleaseIdent = source.Ident
@@ -261,165 +264,6 @@ func generateBiblioRef(source, target *Release, matchResult MatchResult, provena
return &bref
}
-// makeKeyFunc creates a function that can be used as keyFunc, selecting a
-// column from sep.
-func makeKeyFunc(sep string, column int) func(string) (string, error) {
- return func(s string) (string, error) {
- if k := lineColumn(s, "\t", 2); k == "" {
- return k, fmt.Errorf("cannot get key: %s", s)
- } else {
- return k, nil
- }
- }
-}
-
-// ZipUnverified takes a release and refs reader (tsv, with ident, key, doc)
-// and assigns a fixed match result.
-func ZipUnverified(releases, refs io.Reader, mr MatchResult, provenance string, w io.Writer) error {
- var (
- enc = json.NewEncoder(w)
- keyer = makeKeyFunc("\t", 2)
- grouper = func(g *zipkey.Group) error {
- if len(g.G0) == 0 || len(g.G1) == 0 {
- return nil
- }
- target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
- if err != nil {
- return err
- }
- for _, line := range g.G1 {
- ref, err := stringToRef(lineColumn(line, "\t", 3))
- if err != nil {
- return err
- }
- var bref BiblioRef
- bref.SourceReleaseIdent = ref.ReleaseIdent
- bref.SourceWorkIdent = ref.WorkIdent
- bref.SourceReleaseStage = ref.ReleaseStage
- bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear)
- bref.RefIndex = ref.Index + 1 // we want 1-index (also helps with omitempty)
- bref.RefKey = ref.Key
- bref.TargetReleaseIdent = target.Ident
- bref.TargetWorkIdent = target.WorkID
- bref.MatchProvenance = provenance
- bref.MatchStatus = mr.Status.Short()
- bref.MatchReason = mr.Reason.Short()
- if err := enc.Encode(bref); err != nil {
- return err
- }
- }
- return nil
- }
- )
- zipper := zipkey.New(releases, refs, keyer, grouper)
- return zipper.Run()
-}
-
-// ZipWikiUnverified takes a release and wiki reader (tsv, with ident, key, doc)
-// and assigns a fixed match result.
-func ZipWikiUnverified(releases, wiki io.Reader, mr MatchResult, provenance string, w io.Writer) error {
- var (
- enc = json.NewEncoder(w)
- keyer = makeKeyFunc("\t", 2)
- grouper = func(g *zipkey.Group) error {
- if len(g.G0) == 0 || len(g.G1) == 0 {
- return nil
- }
- target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
- if err != nil {
- return err
- }
- for _, line := range g.G1 {
- wiki, err := stringToWiki(lineColumn(line, "\t", 3))
- if err != nil {
- return err
- }
- var bref BiblioRef
- bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use?
- bref.SourceWikipediaArticle = wiki.PageTitle
- bref.TargetReleaseIdent = target.Ident
- bref.TargetWorkIdent = target.WorkID
- bref.MatchProvenance = provenance
- bref.MatchStatus = mr.Status.Short()
- bref.MatchReason = mr.Reason.Short()
- if err := enc.Encode(bref); err != nil {
- return err
- }
- }
- return nil
- }
- )
- zipper := zipkey.New(releases, wiki, keyer, grouper)
- return zipper.Run()
-}
-
-// ZipVerifyRefs takes a release and refs reader (tsv, with ident, key, doc)
-// and will execute gf for each group found.
-func ZipVerifyRefs(releases, refs io.Reader, w io.Writer) error {
- // Define a grouper, working on one set of refs and releases with the same
- // key at a time. Here, we do verification and write out the generated
- // biblioref.
- var (
- enc = json.NewEncoder(w)
- keyer = makeKeyFunc("\t", 2)
- grouper = func(g *zipkey.Group) error {
- if len(g.G0) == 0 || len(g.G1) == 0 {
- return nil
- }
- pivot, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
- if err != nil {
- return err
- }
- for _, line := range g.G1 {
- re, err := stringToRelease(lineColumn(line, "\t", 3))
- if err != nil {
- return err
- }
- result := Verify(pivot, re)
- switch result.Status {
- case StatusExact, StatusStrong:
- if result.Reason == ReasonDOI {
- continue
- }
- br := generateBiblioRef(re, pivot, result, "fuzzy")
- if err := enc.Encode(br); err != nil {
- return err
- }
- }
- }
- return nil
- }
- )
- zipper := zipkey.New(releases, refs, keyer, grouper)
- return zipper.Run()
-}
-
-// lineColumn returns a specific column (1-indexed, like cut) from a tabular
-// file, returns empty string if column is invalid.
-func lineColumn(line, sep string, column int) string {
- parts := strings.Split(strings.TrimSpace(line), sep)
- if len(parts) < column {
- return ""
- } else {
- return parts[column-1]
- }
-}
-
-func stringToRelease(s string) (r *Release, err error) {
- err = json.Unmarshal([]byte(s), &r)
- return
-}
-
-func stringToRef(s string) (r *Ref, err error) {
- err = json.Unmarshal([]byte(s), &r)
- return
-}
-
-func stringToWiki(s string) (r *MinimalCitations, err error) {
- err = json.Unmarshal([]byte(s), &r)
- return
-}
-
// Verify verifies two releases and will ignore short titles.
func Verify(a, b *Release) MatchResult {
return VerifyMinTitleLength(a, b, 5)