Diffstat (limited to 'skate')
-rw-r--r--  skate/cmd/skate-verify/main.go    2
-rw-r--r--  skate/verify.go                 101
2 files changed, 81 insertions, 22 deletions
diff --git a/skate/cmd/skate-verify/main.go b/skate/cmd/skate-verify/main.go
index 7a8ec9a..19146c9 100644
--- a/skate/cmd/skate-verify/main.go
+++ b/skate/cmd/skate-verify/main.go
@@ -110,7 +110,7 @@ func main() {
 		}
 	case "ref":
 		// https://git.io/JtACz
-		pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefCluster)
+		pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterVerify)
 		pp.NumWorkers = *numWorkers
 		pp.BatchSize = *batchSize
 		if err := pp.Run(); err != nil {
diff --git a/skate/verify.go b/skate/verify.go
index fa9abd1..1f59514 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -100,6 +100,19 @@ type MatchResult struct {
 	Reason Reason
 }
+// MatchPair groups two identifiers and their match status and match
+// reason.
+type MatchPair struct {
+	A      string
+	B      string
+	Result MatchResult
+}
+
+// AsLine returns the match pair as a single, newline-terminated TSV line.
+func (m *MatchPair) AsLine() string {
+	return fmt.Sprintf("%s\t%s\t%s\t%s\n", m.A, m.B, m.Result.Status, m.Result.Reason)
+}
+
 var (
 	PatAppendix = regexp.MustCompile(`appendix ?[^ ]*$`)
 	PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`)
@@ -110,37 +123,89 @@ var (
 	PatPages = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`)
 )
-// XXX: add all pairs verification (e.g. self-match).
+// jsonMarshalLine marshals a value as JSON and adds a newline.
+func jsonMarshalLine(v interface{}) ([]byte, error) {
+	b, err := json.Marshal(v)
+	if err != nil {
+		return nil, err
+	}
+	b = append(b, []byte("\n")...)
+	return b, nil
+}
-// RefCluster deserialized a single cluster document and returns a tabular file
-// with identifiers, match status and reason.
-func RefCluster(p []byte) ([]byte, error) {
+// ClusterVerifyMaxSize runs verification across all pairs in clusters of at
+// most maxClusterSize elements; a port of https://git.io/JYgOB from fuzzycat.
+func ClusterVerifyMaxSize(p []byte, maxClusterSize int) ([]byte, error) {
 	var (
-		cr  *ReleaseCluster
+		rc  *ReleaseCluster
 		buf bytes.Buffer
 	)
-	if err := json.Unmarshal(p, &cr); err != nil {
+	if err := json.Unmarshal(p, &rc); err != nil {
 		return nil, err
 	}
-	pivot, err := cr.OneNonRef()
+	n := len(rc.Values)
+	if n > maxClusterSize {
+		return nil, nil
+	}
+	// O(n^2) comparisons ahead; specifically, n * (n-1) / 2 pairs.
+	for i := 0; i < n; i++ {
+		for j := i; j < n; j++ {
+			if i == j {
+				continue
+			}
+			a := rc.Values[i]
+			b := rc.Values[j]
+			matchPair := &MatchPair{
+				A:      a.Ident,
+				B:      b.Ident,
+				Result: Verify(a, b, 5),
+			}
+			if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil {
+				return nil, err
+			}
+		}
+	}
+	return buf.Bytes(), nil
+}
+
+// ClusterVerify runs the verification process across all pairs, but skips
+// clusters containing more than ten elements.
+func ClusterVerify(p []byte) ([]byte, error) {
+	return ClusterVerifyMaxSize(p, 10)
+}
+
+// RefClusterVerify deserializes a cluster document containing both converted
+// references and releases and returns a tabular verification result between
+// one release and all references found.
+func RefClusterVerify(p []byte) ([]byte, error) {
+	var (
+		rc  *ReleaseCluster
+		buf bytes.Buffer
+	)
+	if err := json.Unmarshal(p, &rc); err != nil {
+		return nil, err
+	}
+	pivot, err := rc.OneNonRef()
 	if err != nil {
 		return nil, err
 	}
-	for _, re := range cr.Values {
+	for _, re := range rc.Values {
 		if re.Extra.Skate.Status != "ref" {
 			continue
 		}
-		result := Verify(pivot, re, 5)
-		if _, err := fmt.Fprintf(&buf, "%s %s %s %s\n",
-			pivot.Ident, re.Ident, result.Status, result.Reason); err != nil {
+		matchPair := &MatchPair{
+			A:      pivot.Ident,
+			B:      re.Ident,
+			Result: Verify(pivot, re, 5),
+		}
+		if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil {
 			return nil, err
 		}
-		// XXX: We can generate a biblioref here, too.
 	}
 	return buf.Bytes(), nil
 }
-// RefClusterToBiblioRef creates a BiblioRef schema from exact and strong matches.
+// RefClusterToBiblioRef creates a BiblioRef schema from exact and strong matches only.
 func RefClusterToBiblioRef(p []byte) ([]byte, error) {
 	var (
 		cr *ReleaseCluster
@@ -162,16 +227,10 @@ func RefClusterToBiblioRef(p []byte) ([]byte, error) {
 		switch result.Status {
 		case StatusExact, StatusStrong:
 			if result.Reason == ReasonDOI {
-				// Assume we already have the DOI matches.
-				continue
+				continue // Assume we already have the DOI matches.
 			}
 			br = generateBiblioRef(re, pivot, result.Status, result.Reason, "fuzzy")
-			b, err := json.Marshal(br)
-			if err != nil {
-				return nil, err
-			}
-			b = append(b, []byte("\n")...)
-			return b, nil
+			return jsonMarshalLine(br)
 		default:
 			continue
 		}
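
A minimal sketch of how the new all-pairs entry point might be driven, mirroring the parallel.Processor wiring shown in cmd/skate-verify/main.go above. The import paths and the worker/batch values are assumptions for illustration, not part of this change.

package main

import (
	"log"
	"os"

	"github.com/miku/parallel" // assumed import path for the parallel package used above
	"example.org/skate"        // hypothetical import path for the skate package
)

func main() {
	// Each input line is a JSON release cluster; each output line is the TSV
	// produced by MatchPair.AsLine: ident A, ident B, match status, match reason.
	pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.ClusterVerify)
	pp.NumWorkers = 4    // assumed value; main.go reads this from a flag
	pp.BatchSize = 10000 // assumed value; main.go reads this from a flag
	if err := pp.Run(); err != nil {
		log.Fatal(err)
	}
}

Since ClusterVerify skips clusters with more than ten elements, oversized clusters simply produce no output lines.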