From ac7548d85b414f1d34e13ef5ec46af4ad647040f Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Wed, 31 Mar 2021 14:18:34 +0200 Subject: refine verification --- skate/cmd/skate-verify/main.go | 2 +- skate/verify.go | 101 ++++++++++++++++++++++++++++++++--------- 2 files changed, 81 insertions(+), 22 deletions(-) (limited to 'skate') diff --git a/skate/cmd/skate-verify/main.go b/skate/cmd/skate-verify/main.go index 7a8ec9a..19146c9 100644 --- a/skate/cmd/skate-verify/main.go +++ b/skate/cmd/skate-verify/main.go @@ -110,7 +110,7 @@ func main() { } case "ref": // https://git.io/JtACz - pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefCluster) + pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterVerify) pp.NumWorkers = *numWorkers pp.BatchSize = *batchSize if err := pp.Run(); err != nil { diff --git a/skate/verify.go b/skate/verify.go index fa9abd1..1f59514 100644 --- a/skate/verify.go +++ b/skate/verify.go @@ -100,6 +100,19 @@ type MatchResult struct { Reason Reason } +// MatchPair groups two identifiers and their match status and +// match reason. +type MatchPair struct { + A string + B string + Result MatchResult +} + +// AsLine returns a TSV line of the match pair. +func (m *MatchPair) AsLine() string { + return fmt.Sprintf("%s\t%s\t%s\t%s\n", m.A, m.B, m.Result.Status, m.Result.Reason) +} + var ( PatAppendix = regexp.MustCompile(`appendix ?[^ ]*$`) PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`) @@ -110,37 +123,89 @@ var ( PatPages = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`) ) -// XXX: add all pairs verification (e.g. self-match). +// jsonMarshalLine marshals a value as JSON and adds a newline. +func jsonMarshalLine(v interface{}) ([]byte, error) { + b, err := json.Marshal(v) + if err != nil { + return nil, err + } + b = append(b, []byte("\n")...) + return b, nil +} -// RefCluster deserialized a single cluster document and returns a tabular file -// with identifiers, match status and reason. 
-func RefCluster(p []byte) ([]byte, error) { +// ClusterVerifyMaxSize runs verification across all pairs in the cluster. This is a +// port of https://git.io/JYgOB from fuzzycat. +func ClusterVerifyMaxSize(p []byte, maxClusterSize int) ([]byte, error) { var ( - cr *ReleaseCluster + rc *ReleaseCluster buf bytes.Buffer ) - if err := json.Unmarshal(p, &cr); err != nil { + if err := json.Unmarshal(p, &rc); err != nil { return nil, err } - pivot, err := cr.OneNonRef() + n := len(rc.Values) + if n > maxClusterSize { + return nil, nil + } + // O(n^2) ahead, specifically, n * (n-1) / 2. + for i := 0; i < n; i++ { + for j := i; j < n; j++ { + if i == j { + continue + } + a := rc.Values[i] + b := rc.Values[j] + matchPair := &MatchPair{ + A: a.Ident, + B: b.Ident, + Result: Verify(a, b, 5), + } + if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil { + return nil, err + } + } + } + return buf.Bytes(), nil +} + +// ClusterVerify runs verification process across all pairs, but skips clusters +// containing more than ten elements. +func ClusterVerify(p []byte) ([]byte, error) { + return ClusterVerifyMaxSize(p, 10) +} + +// RefClusterVerify deserializes a cluster document containing both converted +// references and releases and returns a tabular verification result between +// one release and all references found. 
+func RefClusterVerify(p []byte) ([]byte, error) { + var ( + rc *ReleaseCluster + buf bytes.Buffer + ) + if err := json.Unmarshal(p, &rc); err != nil { + return nil, err + } + pivot, err := rc.OneNonRef() if err != nil { return nil, err } - for _, re := range cr.Values { + for _, re := range rc.Values { if re.Extra.Skate.Status != "ref" { continue } - result := Verify(pivot, re, 5) - if _, err := fmt.Fprintf(&buf, "%s %s %s %s\n", - pivot.Ident, re.Ident, result.Status, result.Reason); err != nil { + matchPair := &MatchPair{ + A: pivot.Ident, + B: re.Ident, + Result: Verify(pivot, re, 5), + } + if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil { return nil, err } - // XXX: We can generate a biblioref here, too. } return buf.Bytes(), nil } -// RefClusterToBiblioRef creates a BiblioRef schema from exact and strong matches. +// RefClusterToBiblioRef creates a BiblioRef schema from exact and strong matches only. func RefClusterToBiblioRef(p []byte) ([]byte, error) { var ( cr *ReleaseCluster @@ -162,16 +227,10 @@ func RefClusterToBiblioRef(p []byte) ([]byte, error) { switch result.Status { case StatusExact, StatusStrong: if result.Reason == ReasonDOI { - // Assume we already have the DOI matches. - continue + continue // Assume we already have the DOI matches. } br = generateBiblioRef(re, pivot, result.Status, result.Reason, "fuzzy") - b, err := json.Marshal(br) - if err != nil { - return nil, err - } - b = append(b, []byte("\n")...) - return b, nil + return jsonMarshalLine(br) default: continue } -- cgit v1.2.3