3 files changed, 39 insertions, 177 deletions
diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go
index 2ff7de4..7918a28 100644
--- a/skate/cmd/skate-reduce/main.go
+++ b/skate/cmd/skate-reduce/main.go
@@ -20,17 +20,6 @@
 //             | $ skate-reduce -m fuzzy -F a.tsv -L b.tsv
 //             |
 //             |
-// * ref       | takes a single file with clusters containing releases and refs and
-//             | will emit verification results (deprecated).
-//             |
-//             | $ skate-reduce -m ref < a.ndj
-//             |
-//             |
-// * bref      | same as ref, but generate a biblioref file as output (deprecated).
-//             |
-//             | $ skate-reduce -m bref < a.ndj
-//             |
-//             |
 // * wiki      | zippy mode for releases and wikipedia inputs.
 //             |
 //             | $ skate-reduce -m wiki -L a.ndj -W b.ndj
@@ -72,7 +61,6 @@ import (
 	"runtime"
 
 	"git.archive.org/martin/cgraph/skate"
-	"git.archive.org/martin/cgraph/skate/parallel"
 	"git.archive.org/martin/cgraph/skate/xio"
 	gzip "github.com/klauspost/compress/gzip"
 )
@@ -153,20 +141,6 @@ func main() {
 		if err := skate.ZippyVerifyRefs(l, f, bw); err != nil {
 			log.Fatal(err)
 		}
-	case "ref":
-		pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterVerify)
-		pp.NumWorkers = *numWorkers
-		pp.BatchSize = *batchSize
-		if err := pp.Run(); err != nil {
-			log.Fatal(err)
-		}
-	case "bref":
-		pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterToBiblioRef)
-		pp.NumWorkers = *numWorkers
-		pp.BatchSize = *batchSize
-		if err := pp.Run(); err != nil {
-			log.Fatal(err)
-		}
 	case "wiki":
 		l, w, err := xio.OpenTwo(*releases, *wiki)
 		if err != nil {
diff --git a/skate/verify.go b/skate/verify.go
index 5cb56bb..22f0a0d 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -1,5 +1,8 @@
 // TODO: The various grouping and verification functions should probably be in
 // a separate file and it should be obvious how to adjust or write a new one.
+//
+// This file contains a port of fuzzycat.verify
+// (https://gitlab.com/internetarchive/fuzzycat) to Go.
 
 //go:generate stringer -type=Status,Reason -output verify_string.go verify.go
 package skate
@@ -7,7 +10,6 @@ package skate
 import (
 	"bytes"
 	"fmt"
-	"io"
 	"regexp"
 	"strconv"
 	"strings"
@@ -17,8 +19,6 @@ import (
 	"github.com/segmentio/encoding/json"
 )
 
-// This file contains a port of fuzzycat.verify to Go.
-
 type (
 	// Status represents match strength.
 	Status int
@@ -87,12 +87,22 @@ const (
 	ReasonYear
 )
 
-// Short name.
+var (
+	PatAppendix        = regexp.MustCompile(`appendix ?[^ ]*$`)
+	PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`)
+	PatVersionedDOI    = regexp.MustCompile(`10[.].*/v[0-9]{1,}$`)
+	PatArxivVersion    = regexp.MustCompile(`(.*)v[0-9]{1,2}$`)
+	PatFilenameLike    = regexp.MustCompile(`.*[.][a-z]{2,3}$`)
+	PatDigits          = regexp.MustCompile(`\d+`)
+	PatPages           = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`)
+)
+
+// Short returns the status name in lower case, without the "Status" prefix.
 func (s Status) Short() string {
 	return strings.ToLower(strings.Replace(s.String(), "Status", "", 1))
 }
 
-// Short name.
+// Short returns the reason name in lower case, without the "Reason" prefix.
 func (r Reason) Short() string {
 	return strings.ToLower(strings.Replace(r.String(), "Reason", "", 1))
 }
@@ -116,16 +126,6 @@ func (m *MatchPair) AsLine() string {
 	return fmt.Sprintf("%s\t%s\t%s\t%s\n", m.A, m.B, m.Result.Status, m.Result.Reason)
 }
 
-var (
-	PatAppendix        = regexp.MustCompile(`appendix ?[^ ]*$`)
-	PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`)
-	PatVersionedDOI    = regexp.MustCompile(`10[.].*/v[0-9]{1,}$`)
-	PatArxivVersion    = regexp.MustCompile(`(.*)v[0-9]{1,2}$`)
-	PatFilenameLike    = regexp.MustCompile(`.*[.][a-z]{2,3}$`)
-	PatDigits          = regexp.MustCompile(`\d+`)
-	PatPages           = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`)
-)
-
 // JsonMarshalNewline marshals a value as JSON and adds a newline.
 func JsonMarshalNewline(v interface{}) ([]byte, error) {
 	b, err := json.Marshal(v)
@@ -136,137 +136,6 @@ func JsonMarshalNewline(v interface{}) ([]byte, error) {
 	return b, nil
 }
 
-// ClusterVerifyMaxClusterSize runs verification across all pairs in the cluster. This is a
-// port of https://git.io/JYgOB from fuzzycat. This is good for "self-match" verification.
-func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) {
-	var (
-		rc  *ReleaseCluster
-		buf bytes.Buffer
-		n   int
-	)
-	if err := json.Unmarshal(p, &rc); err != nil {
-		return nil, err
-	}
-	if n = len(rc.Values); n > maxClusterSize {
-		return nil, nil
-	}
-	// O(n^2) ahead, specifically, n * (n-1) / 2.
-	for i := 0; i < n; i++ {
-		for j := i; j < n; j++ {
-			if i == j {
-				continue
-			}
-			a := rc.Values[i]
-			b := rc.Values[j]
-			matchPair := &MatchPair{
-				A:      a.Ident,
-				B:      b.Ident,
-				Result: Verify(a, b),
-			}
-			if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil {
-				return nil, err
-			}
-		}
-	}
-	return buf.Bytes(), nil
-}
-
-// ClusterVerify runs verification process across all pairs, but skips clusters
-// containing more than ten elements. If a cluster has more then 10 elements,
-// it might also signal a too ambiguous title. Beside, we do not want this to
-// be too slow.
-func ClusterVerify(p []byte) ([]byte, error) {
-	return ClusterVerifyMaxClusterSize(p, 10)
-}
-
-// RefClusterVerify deserializes a cluster document containing both converted
-// references and releases and returns a tabular verification result between
-// one (any) release and all references found. This depends on refs and releases
-// being distinguishable, (e.g. via .extra.skate.status == "ref").
-func RefClusterVerify(p []byte) ([]byte, error) {
-	var (
-		rc        *ReleaseCluster
-		buf       bytes.Buffer
-		pivot, re *Release
-		err       error
-	)
-	if err = json.Unmarshal(p, &rc); err != nil {
-		return nil, err
-	}
-	if pivot, err = rc.OneNonRef(); err != nil {
-		return nil, err
-	}
-	for _, re = range rc.Values {
-		if re.Extra.Skate.Status != "ref" {
-			continue
-		}
-		matchPair := &MatchPair{
-			A:      pivot.Ident,
-			B:      re.Ident,
-			Result: Verify(pivot, re),
-		}
-		if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil {
-			return nil, err
-		}
-	}
-	return buf.Bytes(), nil
-}
-
-// RefClusterToBiblioRef runs verification and creates a BiblioRef schema from
-// exact and strong matches only.
-func RefClusterToBiblioRef(p []byte) ([]byte, error) {
-	var (
-		rc        *ReleaseCluster
-		br        *BiblioRef
-		buf       bytes.Buffer
-		pivot, re *Release
-		err       error
-	)
-	if err = json.Unmarshal(p, &rc); err != nil {
-		return nil, err
-	}
-	if pivot, err = rc.OneNonRef(); err != nil {
-		return nil, err
-	}
-	for _, re = range rc.Values {
-		if re.Extra.Skate.Status != "ref" {
-			continue
-		}
-		result := Verify(pivot, re)
-		switch result.Status {
-		case StatusExact, StatusStrong:
-			if result.Reason == ReasonDOI {
-				continue // Assume we already have the DOI matches.
-			}
-			br = generateBiblioRef(re, pivot, result, "fuzzy")
-			return JsonMarshalNewline(br)
-		default:
-			// XXX: may want to include non matches here.
-			continue
-		}
-	}
-	return buf.Bytes(), nil
-}
-
-// generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd.
-func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef {
-	var bref BiblioRef
-	bref.SourceReleaseIdent = source.Ident
-	bref.SourceWorkIdent = source.WorkID
-	bref.SourceReleaseStage = source.ReleaseStage
-	if source.ReleaseYear() > 1000 {
-		bref.SourceYear = source.ReleaseYearString()
-	}
-	bref.RefIndex = source.Extra.Skate.Ref.Index
-	bref.RefKey = source.Extra.Skate.Ref.Key
-	bref.TargetReleaseIdent = target.Ident
-	bref.TargetWorkIdent = target.WorkID
-	bref.MatchProvenance = provenance
-	bref.MatchStatus = matchResult.Status.Short()
-	bref.MatchReason = matchResult.Reason.Short()
-	return &bref
-}
-
 // Verify verifies two releases and will ignore short titles.
 func Verify(a, b *Release) MatchResult {
 	return VerifyMinTitleLength(a, b, 5)
@@ -542,6 +411,25 @@ func VerifyMinTitleLength(a, b *Release, minTitleLength int) MatchResult {
 	}
 }
 
+// generateBiblioRef builds a BiblioRef from source to target, recording match result and provenance. XXX: This is a bit odd.
+func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef {
+	var bref BiblioRef
+	bref.SourceReleaseIdent = source.Ident
+	bref.SourceWorkIdent = source.WorkID
+	bref.SourceReleaseStage = source.ReleaseStage
+	if source.ReleaseYear() > 1000 {
+		bref.SourceYear = source.ReleaseYearString()
+	}
+	bref.RefIndex = source.Extra.Skate.Ref.Index
+	bref.RefKey = source.Extra.Skate.Ref.Key
+	bref.TargetReleaseIdent = target.Ident
+	bref.TargetWorkIdent = target.WorkID
+	bref.MatchProvenance = provenance
+	bref.MatchStatus = matchResult.Status.Short()
+	bref.MatchReason = matchResult.Reason.Short()
+	return &bref
+}
+
 type ParsedPages struct {
 	Start int
 	End   int
diff --git a/skate/zipkey/zipkey.go b/skate/zipkey/zipkey.go
index ffd33fe..e5e9f07 100644
--- a/skate/zipkey/zipkey.go
+++ b/skate/zipkey/zipkey.go
@@ -21,10 +21,10 @@ type (
 )
 
 // ZipRun reads records (separated by sep) from two readers, extracts a key
-// from each record with a keyFunc and collects records from the two streams
-// into a Group. A callback groupFunc can be registered, which allows to
-// customize the processing of the group. Current limitation: both streams need
-// to use the same keyFunc.
+// from each record with a keyFunc and collects records with the same key from
+// the two streams into a Group. A callback groupFunc can be registered to
+// customize the processing of the group. Current limitation: both streams
+// need to use the same keyFunc.
 type ZipRun struct {
 	r0, r1 *bufio.Reader
 	kf     keyFunc
@@ -44,7 +44,7 @@ func New(r0, r1 io.Reader, kf keyFunc, gf groupFunc) *ZipRun {
 }
 
 // Run starts reading from both readers. The process stops, if one reader is
-// exhausted or reads from any reader fail.
+// exhausted or a read from any reader fails.
 func (z *ZipRun) Run() error {
 	var (
 		k0, k1, c0, c1 string // key: k0, k1; current line: c0, c1