diff options
Diffstat (limited to 'skate/verify.go')
-rw-r--r-- | skate/verify.go | 180 |
1 files changed, 34 insertions, 146 deletions
diff --git a/skate/verify.go b/skate/verify.go index 5cb56bb..22f0a0d 100644 --- a/skate/verify.go +++ b/skate/verify.go @@ -1,5 +1,8 @@ // TODO: The various grouping and verification functions should probably be in // a separate file and it should be obvious how to adjust or write a new one. +// +// This file contains a port of fuzzycat.verify +// (https://gitlab.com/internetarchive/fuzzycat) to Go. //go:generate stringer -type=Status,Reason -output verify_string.go verify.go package skate @@ -7,7 +10,6 @@ package skate import ( "bytes" "fmt" - "io" "regexp" "strconv" "strings" @@ -17,8 +19,6 @@ import ( "github.com/segmentio/encoding/json" ) -// This file contains a port of fuzzycat.verify to Go. - type ( // Status represents match strength. Status int @@ -87,12 +87,22 @@ const ( ReasonYear ) -// Short name. +var ( + PatAppendix = regexp.MustCompile(`appendix ?[^ ]*$`) + PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`) + PatVersionedDOI = regexp.MustCompile(`10[.].*/v[0-9]{1,}$`) + PatArxivVersion = regexp.MustCompile(`(.*)v[0-9]{1,2}$`) + PatFilenameLike = regexp.MustCompile(`.*[.][a-z]{2,3}$`) + PatDigits = regexp.MustCompile(`\d+`) + PatPages = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`) +) + +// Short name for status. func (s Status) Short() string { return strings.ToLower(strings.Replace(s.String(), "Status", "", 1)) } -// Short name. +// Short name for reason. func (r Reason) Short() string { return strings.ToLower(strings.Replace(r.String(), "Reason", "", 1)) } @@ -116,16 +126,6 @@ func (m *MatchPair) AsLine() string { return fmt.Sprintf("%s\t%s\t%s\t%s\n", m.A, m.B, m.Result.Status, m.Result.Reason) } -var ( - PatAppendix = regexp.MustCompile(`appendix ?[^ ]*$`) - PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`) - PatVersionedDOI = regexp.MustCompile(`10[.].*/v[0-9]{1,}$`) - PatArxivVersion = regexp.MustCompile(`(.*)v[0-9]{1,2}$`) - PatFilenameLike = regexp.MustCompile(`.*[.][a-z]{2,3}$`) - PatDigits = regexp.MustCompile(`\d+`) - PatPages = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`) -) - // JsonMarshalNewline marshals a value as JSON and adds a newline. func JsonMarshalNewline(v interface{}) ([]byte, error) { b, err := json.Marshal(v) @@ -136,137 +136,6 @@ func JsonMarshalNewline(v interface{}) ([]byte, error) { return b, nil } -// ClusterVerifyMaxClusterSize runs verification across all pairs in the cluster. This is a -// port of https://git.io/JYgOB from fuzzycat. This is good for "self-match" verification. -func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) { - var ( - rc *ReleaseCluster - buf bytes.Buffer - n int - ) - if err := json.Unmarshal(p, &rc); err != nil { - return nil, err - } - if n = len(rc.Values); n > maxClusterSize { - return nil, nil - } - // O(n^2) ahead, specifically, n * (n-1) / 2. - for i := 0; i < n; i++ { - for j := i; j < n; j++ { - if i == j { - continue - } - a := rc.Values[i] - b := rc.Values[j] - matchPair := &MatchPair{ - A: a.Ident, - B: b.Ident, - Result: Verify(a, b), - } - if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil { - return nil, err - } - } - } - return buf.Bytes(), nil -} - -// ClusterVerify runs verification process across all pairs, but skips clusters -// containing more than ten elements. If a cluster has more then 10 elements, -// it might also signal a too ambiguous title. Beside, we do not want this to -// be too slow. -func ClusterVerify(p []byte) ([]byte, error) { - return ClusterVerifyMaxClusterSize(p, 10) -} - -// RefClusterVerify deserializes a cluster document containing both converted -// references and releases and returns a tabular verification result between -// one (any) release and all references found. This depends on refs and releases -// being distinguishable, (e.g. via .extra.skate.status == "ref"). -func RefClusterVerify(p []byte) ([]byte, error) { - var ( - rc *ReleaseCluster - buf bytes.Buffer - pivot, re *Release - err error - ) - if err = json.Unmarshal(p, &rc); err != nil { - return nil, err - } - if pivot, err = rc.OneNonRef(); err != nil { - return nil, err - } - for _, re = range rc.Values { - if re.Extra.Skate.Status != "ref" { - continue - } - matchPair := &MatchPair{ - A: pivot.Ident, - B: re.Ident, - Result: Verify(pivot, re), - } - if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil { - return nil, err - } - } - return buf.Bytes(), nil -} - -// RefClusterToBiblioRef runs verification and creates a BiblioRef schema from -// exact and strong matches only. -func RefClusterToBiblioRef(p []byte) ([]byte, error) { - var ( - rc *ReleaseCluster - br *BiblioRef - buf bytes.Buffer - pivot, re *Release - err error - ) - if err = json.Unmarshal(p, &rc); err != nil { - return nil, err - } - if pivot, err = rc.OneNonRef(); err != nil { - return nil, err - } - for _, re = range rc.Values { - if re.Extra.Skate.Status != "ref" { - continue - } - result := Verify(pivot, re) - switch result.Status { - case StatusExact, StatusStrong: - if result.Reason == ReasonDOI { - continue // Assume we already have the DOI matches. - } - br = generateBiblioRef(re, pivot, result, "fuzzy") - return JsonMarshalNewline(br) - default: - // XXX: may want to include non matches here. - continue - } - } - return buf.Bytes(), nil -} - -// generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd. -func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef { - var bref BiblioRef - bref.SourceReleaseIdent = source.Ident - bref.SourceWorkIdent = source.WorkID - bref.SourceReleaseStage = source.ReleaseStage - if source.ReleaseYear() > 1000 { - bref.SourceYear = source.ReleaseYearString() - } - bref.RefIndex = source.Extra.Skate.Ref.Index - bref.RefKey = source.Extra.Skate.Ref.Key - bref.TargetReleaseIdent = target.Ident - bref.TargetWorkIdent = target.WorkID - bref.MatchProvenance = provenance - bref.MatchStatus = matchResult.Status.Short() - bref.MatchReason = matchResult.Reason.Short() - return &bref -} - // Verify verifies two releases and will ignore short titles. func Verify(a, b *Release) MatchResult { return VerifyMinTitleLength(a, b, 5) @@ -542,6 +411,25 @@ func VerifyMinTitleLength(a, b *Release, minTitleLength int) MatchResult { } } +// generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd. +func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef { + var bref BiblioRef + bref.SourceReleaseIdent = source.Ident + bref.SourceWorkIdent = source.WorkID + bref.SourceReleaseStage = source.ReleaseStage + if source.ReleaseYear() > 1000 { + bref.SourceYear = source.ReleaseYearString() + } + bref.RefIndex = source.Extra.Skate.Ref.Index + bref.RefKey = source.Extra.Skate.Ref.Key + bref.TargetReleaseIdent = target.Ident + bref.TargetWorkIdent = target.WorkID + bref.MatchProvenance = provenance + bref.MatchStatus = matchResult.Status.Short() + bref.MatchReason = matchResult.Reason.Short() + return &bref +} + type ParsedPages struct { Start int End int |