about summary refs log tree commit diff stats
path: root/skate/verify.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/verify.go')
-rw-r--r-- skate/verify.go 180
1 files changed, 34 insertions, 146 deletions
diff --git a/skate/verify.go b/skate/verify.go
index 5cb56bb..22f0a0d 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -1,5 +1,8 @@
// TODO: The various grouping and verification functions should probably be in
// a separate file and it should be obvious how to adjust or write a new one.
+//
+// This file contains a port of fuzzycat.verify
+// (https://gitlab.com/internetarchive/fuzzycat) to Go.
//go:generate stringer -type=Status,Reason -output verify_string.go verify.go
package skate
@@ -7,7 +10,6 @@ package skate
import (
"bytes"
"fmt"
- "io"
"regexp"
"strconv"
"strings"
@@ -17,8 +19,6 @@ import (
"github.com/segmentio/encoding/json"
)
-// This file contains a port of fuzzycat.verify to Go.
-
type (
// Status represents match strength.
Status int
@@ -87,12 +87,22 @@ const (
ReasonYear
)
-// Short name.
+var (
+ PatAppendix = regexp.MustCompile(`appendix ?[^ ]*$`)
+ PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`)
+ PatVersionedDOI = regexp.MustCompile(`10[.].*/v[0-9]{1,}$`)
+ PatArxivVersion = regexp.MustCompile(`(.*)v[0-9]{1,2}$`)
+ PatFilenameLike = regexp.MustCompile(`.*[.][a-z]{2,3}$`)
+ PatDigits = regexp.MustCompile(`\d+`)
+ PatPages = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`)
+)
+
+// Short name for status.
func (s Status) Short() string {
return strings.ToLower(strings.Replace(s.String(), "Status", "", 1))
}
-// Short name.
+// Short name for reason.
func (r Reason) Short() string {
return strings.ToLower(strings.Replace(r.String(), "Reason", "", 1))
}
@@ -116,16 +126,6 @@ func (m *MatchPair) AsLine() string {
return fmt.Sprintf("%s\t%s\t%s\t%s\n", m.A, m.B, m.Result.Status, m.Result.Reason)
}
-var (
- PatAppendix = regexp.MustCompile(`appendix ?[^ ]*$`)
- PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`)
- PatVersionedDOI = regexp.MustCompile(`10[.].*/v[0-9]{1,}$`)
- PatArxivVersion = regexp.MustCompile(`(.*)v[0-9]{1,2}$`)
- PatFilenameLike = regexp.MustCompile(`.*[.][a-z]{2,3}$`)
- PatDigits = regexp.MustCompile(`\d+`)
- PatPages = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`)
-)
-
// JsonMarshalNewline marshals a value as JSON and adds a newline.
func JsonMarshalNewline(v interface{}) ([]byte, error) {
b, err := json.Marshal(v)
@@ -136,137 +136,6 @@ func JsonMarshalNewline(v interface{}) ([]byte, error) {
return b, nil
}
-// ClusterVerifyMaxClusterSize runs verification across all pairs in the cluster. This is a
-// port of https://git.io/JYgOB from fuzzycat. This is good for "self-match" verification.
-func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) {
- var (
- rc *ReleaseCluster
- buf bytes.Buffer
- n int
- )
- if err := json.Unmarshal(p, &rc); err != nil {
- return nil, err
- }
- if n = len(rc.Values); n > maxClusterSize {
- return nil, nil
- }
- // O(n^2) ahead, specifically, n * (n-1) / 2.
- for i := 0; i < n; i++ {
- for j := i; j < n; j++ {
- if i == j {
- continue
- }
- a := rc.Values[i]
- b := rc.Values[j]
- matchPair := &MatchPair{
- A: a.Ident,
- B: b.Ident,
- Result: Verify(a, b),
- }
- if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil {
- return nil, err
- }
- }
- }
- return buf.Bytes(), nil
-}
-
-// ClusterVerify runs verification process across all pairs, but skips clusters
-// containing more than ten elements. If a cluster has more then 10 elements,
-// it might also signal a too ambiguous title. Beside, we do not want this to
-// be too slow.
-func ClusterVerify(p []byte) ([]byte, error) {
- return ClusterVerifyMaxClusterSize(p, 10)
-}
-
-// RefClusterVerify deserializes a cluster document containing both converted
-// references and releases and returns a tabular verification result between
-// one (any) release and all references found. This depends on refs and releases
-// being distinguishable, (e.g. via .extra.skate.status == "ref").
-func RefClusterVerify(p []byte) ([]byte, error) {
- var (
- rc *ReleaseCluster
- buf bytes.Buffer
- pivot, re *Release
- err error
- )
- if err = json.Unmarshal(p, &rc); err != nil {
- return nil, err
- }
- if pivot, err = rc.OneNonRef(); err != nil {
- return nil, err
- }
- for _, re = range rc.Values {
- if re.Extra.Skate.Status != "ref" {
- continue
- }
- matchPair := &MatchPair{
- A: pivot.Ident,
- B: re.Ident,
- Result: Verify(pivot, re),
- }
- if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil {
- return nil, err
- }
- }
- return buf.Bytes(), nil
-}
-
-// RefClusterToBiblioRef runs verification and creates a BiblioRef schema from
-// exact and strong matches only.
-func RefClusterToBiblioRef(p []byte) ([]byte, error) {
- var (
- rc *ReleaseCluster
- br *BiblioRef
- buf bytes.Buffer
- pivot, re *Release
- err error
- )
- if err = json.Unmarshal(p, &rc); err != nil {
- return nil, err
- }
- if pivot, err = rc.OneNonRef(); err != nil {
- return nil, err
- }
- for _, re = range rc.Values {
- if re.Extra.Skate.Status != "ref" {
- continue
- }
- result := Verify(pivot, re)
- switch result.Status {
- case StatusExact, StatusStrong:
- if result.Reason == ReasonDOI {
- continue // Assume we already have the DOI matches.
- }
- br = generateBiblioRef(re, pivot, result, "fuzzy")
- return JsonMarshalNewline(br)
- default:
- // XXX: may want to include non matches here.
- continue
- }
- }
- return buf.Bytes(), nil
-}
-
-// generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd.
-func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef {
- var bref BiblioRef
- bref.SourceReleaseIdent = source.Ident
- bref.SourceWorkIdent = source.WorkID
- bref.SourceReleaseStage = source.ReleaseStage
- if source.ReleaseYear() > 1000 {
- bref.SourceYear = source.ReleaseYearString()
- }
- bref.RefIndex = source.Extra.Skate.Ref.Index
- bref.RefKey = source.Extra.Skate.Ref.Key
- bref.TargetReleaseIdent = target.Ident
- bref.TargetWorkIdent = target.WorkID
- bref.MatchProvenance = provenance
- bref.MatchStatus = matchResult.Status.Short()
- bref.MatchReason = matchResult.Reason.Short()
- return &bref
-}
-
// Verify verifies two releases and will ignore short titles.
func Verify(a, b *Release) MatchResult {
return VerifyMinTitleLength(a, b, 5)
@@ -542,6 +411,25 @@ func VerifyMinTitleLength(a, b *Release, minTitleLength int) MatchResult {
}
}
+// generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd.
+func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef {
+ var bref BiblioRef
+ bref.SourceReleaseIdent = source.Ident
+ bref.SourceWorkIdent = source.WorkID
+ bref.SourceReleaseStage = source.ReleaseStage
+ if source.ReleaseYear() > 1000 {
+ bref.SourceYear = source.ReleaseYearString()
+ }
+ bref.RefIndex = source.Extra.Skate.Ref.Index
+ bref.RefKey = source.Extra.Skate.Ref.Key
+ bref.TargetReleaseIdent = target.Ident
+ bref.TargetWorkIdent = target.WorkID
+ bref.MatchProvenance = provenance
+ bref.MatchStatus = matchResult.Status.Short()
+ bref.MatchReason = matchResult.Reason.Short()
+ return &bref
+}
+
type ParsedPages struct {
Start int
End int