aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--skate/cmd/skate-reduce/main.go26
-rw-r--r--skate/verify.go180
-rw-r--r--skate/zipkey/zipkey.go10
3 files changed, 39 insertions, 177 deletions
diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go
index 2ff7de4..7918a28 100644
--- a/skate/cmd/skate-reduce/main.go
+++ b/skate/cmd/skate-reduce/main.go
@@ -20,17 +20,6 @@
// | $ skate-reduce -m fuzzy -F a.tsv -L b.tsv
// |
// |
-// * ref | takes a single file with clusters containing releases and refs and
-// | will emit verification results (deprecated).
-// |
-// | $ skate-reduce -m ref < a.ndj
-// |
-// |
-// * bref | same as ref, but generate a biblioref file as output (deprecated).
-// |
-// | $ skate-reduce -m bref < a.ndj
-// |
-// |
// * wiki | zippy mode for releases and wikipedia inputs.
// |
// | $ skate-reduce -m wiki -L a.ndj -W b.ndj
@@ -72,7 +61,6 @@ import (
"runtime"
"git.archive.org/martin/cgraph/skate"
- "git.archive.org/martin/cgraph/skate/parallel"
"git.archive.org/martin/cgraph/skate/xio"
gzip "github.com/klauspost/compress/gzip"
)
@@ -153,20 +141,6 @@ func main() {
if err := skate.ZippyVerifyRefs(l, f, bw); err != nil {
log.Fatal(err)
}
- case "ref":
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterVerify)
- pp.NumWorkers = *numWorkers
- pp.BatchSize = *batchSize
- if err := pp.Run(); err != nil {
- log.Fatal(err)
- }
- case "bref":
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterToBiblioRef)
- pp.NumWorkers = *numWorkers
- pp.BatchSize = *batchSize
- if err := pp.Run(); err != nil {
- log.Fatal(err)
- }
case "wiki":
l, w, err := xio.OpenTwo(*releases, *wiki)
if err != nil {
diff --git a/skate/verify.go b/skate/verify.go
index 5cb56bb..22f0a0d 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -1,5 +1,8 @@
// TODO: The various grouping and verification functions should probably be in
// a separate file and it should be obvious how to adjust or write a new one.
+//
+// This file contains a port of fuzzycat.verify
+// (https://gitlab.com/internetarchive/fuzzycat) to Go.
//go:generate stringer -type=Status,Reason -output verify_string.go verify.go
package skate
@@ -7,7 +10,6 @@ package skate
import (
"bytes"
"fmt"
- "io"
"regexp"
"strconv"
"strings"
@@ -17,8 +19,6 @@ import (
"github.com/segmentio/encoding/json"
)
-// This file contains a port of fuzzycat.verify to Go.
-
type (
// Status represents match strength.
Status int
@@ -87,12 +87,22 @@ const (
ReasonYear
)
-// Short name.
+var (
+ PatAppendix = regexp.MustCompile(`appendix ?[^ ]*$`)
+ PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`)
+ PatVersionedDOI = regexp.MustCompile(`10[.].*/v[0-9]{1,}$`)
+ PatArxivVersion = regexp.MustCompile(`(.*)v[0-9]{1,2}$`)
+ PatFilenameLike = regexp.MustCompile(`.*[.][a-z]{2,3}$`)
+ PatDigits = regexp.MustCompile(`\d+`)
+ PatPages = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`)
+)
+
+// Short returns s.String() lowercased, with the first "Status" removed.
func (s Status) Short() string {
return strings.ToLower(strings.Replace(s.String(), "Status", "", 1))
}
-// Short name.
+// Short returns r.String() lowercased, with the first "Reason" removed.
func (r Reason) Short() string {
return strings.ToLower(strings.Replace(r.String(), "Reason", "", 1))
}
@@ -116,16 +126,6 @@ func (m *MatchPair) AsLine() string {
return fmt.Sprintf("%s\t%s\t%s\t%s\n", m.A, m.B, m.Result.Status, m.Result.Reason)
}
-var (
- PatAppendix = regexp.MustCompile(`appendix ?[^ ]*$`)
- PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`)
- PatVersionedDOI = regexp.MustCompile(`10[.].*/v[0-9]{1,}$`)
- PatArxivVersion = regexp.MustCompile(`(.*)v[0-9]{1,2}$`)
- PatFilenameLike = regexp.MustCompile(`.*[.][a-z]{2,3}$`)
- PatDigits = regexp.MustCompile(`\d+`)
- PatPages = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`)
-)
-
// JsonMarshalNewline marshals a value as JSON and adds a newline.
func JsonMarshalNewline(v interface{}) ([]byte, error) {
b, err := json.Marshal(v)
@@ -136,137 +136,6 @@ func JsonMarshalNewline(v interface{}) ([]byte, error) {
return b, nil
}
-// ClusterVerifyMaxClusterSize runs verification across all pairs in the cluster. This is a
-// port of https://git.io/JYgOB from fuzzycat. This is good for "self-match" verification.
-func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) {
- var (
- rc *ReleaseCluster
- buf bytes.Buffer
- n int
- )
- if err := json.Unmarshal(p, &rc); err != nil {
- return nil, err
- }
- if n = len(rc.Values); n > maxClusterSize {
- return nil, nil
- }
- // O(n^2) ahead, specifically, n * (n-1) / 2.
- for i := 0; i < n; i++ {
- for j := i; j < n; j++ {
- if i == j {
- continue
- }
- a := rc.Values[i]
- b := rc.Values[j]
- matchPair := &MatchPair{
- A: a.Ident,
- B: b.Ident,
- Result: Verify(a, b),
- }
- if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil {
- return nil, err
- }
- }
- }
- return buf.Bytes(), nil
-}
-
-// ClusterVerify runs verification process across all pairs, but skips clusters
-// containing more than ten elements. If a cluster has more then 10 elements,
-// it might also signal a too ambiguous title. Beside, we do not want this to
-// be too slow.
-func ClusterVerify(p []byte) ([]byte, error) {
- return ClusterVerifyMaxClusterSize(p, 10)
-}
-
-// RefClusterVerify deserializes a cluster document containing both converted
-// references and releases and returns a tabular verification result between
-// one (any) release and all references found. This depends on refs and releases
-// being distinguishable, (e.g. via .extra.skate.status == "ref").
-func RefClusterVerify(p []byte) ([]byte, error) {
- var (
- rc *ReleaseCluster
- buf bytes.Buffer
- pivot, re *Release
- err error
- )
- if err = json.Unmarshal(p, &rc); err != nil {
- return nil, err
- }
- if pivot, err = rc.OneNonRef(); err != nil {
- return nil, err
- }
- for _, re = range rc.Values {
- if re.Extra.Skate.Status != "ref" {
- continue
- }
- matchPair := &MatchPair{
- A: pivot.Ident,
- B: re.Ident,
- Result: Verify(pivot, re),
- }
- if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil {
- return nil, err
- }
- }
- return buf.Bytes(), nil
-}
-
-// RefClusterToBiblioRef runs verification and creates a BiblioRef schema from
-// exact and strong matches only.
-func RefClusterToBiblioRef(p []byte) ([]byte, error) {
- var (
- rc *ReleaseCluster
- br *BiblioRef
- buf bytes.Buffer
- pivot, re *Release
- err error
- )
- if err = json.Unmarshal(p, &rc); err != nil {
- return nil, err
- }
- if pivot, err = rc.OneNonRef(); err != nil {
- return nil, err
- }
- for _, re = range rc.Values {
- if re.Extra.Skate.Status != "ref" {
- continue
- }
- result := Verify(pivot, re)
- switch result.Status {
- case StatusExact, StatusStrong:
- if result.Reason == ReasonDOI {
- continue // Assume we already have the DOI matches.
- }
- br = generateBiblioRef(re, pivot, result, "fuzzy")
- return JsonMarshalNewline(br)
- default:
- // XXX: may want to include non matches here.
- continue
- }
- }
- return buf.Bytes(), nil
-}
-
-// generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd.
-func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef {
- var bref BiblioRef
- bref.SourceReleaseIdent = source.Ident
- bref.SourceWorkIdent = source.WorkID
- bref.SourceReleaseStage = source.ReleaseStage
- if source.ReleaseYear() > 1000 {
- bref.SourceYear = source.ReleaseYearString()
- }
- bref.RefIndex = source.Extra.Skate.Ref.Index
- bref.RefKey = source.Extra.Skate.Ref.Key
- bref.TargetReleaseIdent = target.Ident
- bref.TargetWorkIdent = target.WorkID
- bref.MatchProvenance = provenance
- bref.MatchStatus = matchResult.Status.Short()
- bref.MatchReason = matchResult.Reason.Short()
- return &bref
-}
-
// Verify verifies two releases and will ignore short titles.
func Verify(a, b *Release) MatchResult {
return VerifyMinTitleLength(a, b, 5)
@@ -542,6 +411,25 @@ func VerifyMinTitleLength(a, b *Release, minTitleLength int) MatchResult {
}
}
+// generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd.
+func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef {
+ var bref BiblioRef
+ bref.SourceReleaseIdent = source.Ident
+ bref.SourceWorkIdent = source.WorkID
+ bref.SourceReleaseStage = source.ReleaseStage
+ if source.ReleaseYear() > 1000 {
+ bref.SourceYear = source.ReleaseYearString()
+ }
+ bref.RefIndex = source.Extra.Skate.Ref.Index
+ bref.RefKey = source.Extra.Skate.Ref.Key
+ bref.TargetReleaseIdent = target.Ident
+ bref.TargetWorkIdent = target.WorkID
+ bref.MatchProvenance = provenance
+ bref.MatchStatus = matchResult.Status.Short()
+ bref.MatchReason = matchResult.Reason.Short()
+ return &bref
+}
+
type ParsedPages struct {
Start int
End int
diff --git a/skate/zipkey/zipkey.go b/skate/zipkey/zipkey.go
index ffd33fe..e5e9f07 100644
--- a/skate/zipkey/zipkey.go
+++ b/skate/zipkey/zipkey.go
@@ -21,10 +21,10 @@ type (
)
// ZipRun reads records (separated by sep) from two readers, extracts a key
-// from each record with a keyFunc and collects records from the two streams
-// into a Group. A callback groupFunc can be registered, which allows to
-// customize the processing of the group. Current limitation: both streams need
-// to use the same keyFunc.
+// from each record with a keyFunc and collects records with the same key from
+// the two streams into a Group. A callback groupFunc can be registered to
+// customize how each group is processed. Current limitation: both streams
+// need to use the same keyFunc.
type ZipRun struct {
r0, r1 *bufio.Reader
kf keyFunc
@@ -44,7 +44,7 @@ func New(r0, r1 io.Reader, kf keyFunc, gf groupFunc) *ZipRun {
}
// Run starts reading from both readers. The process stops, if one reader is
-// exhausted or reads from any reader fail.
+// exhausted or a read from any reader fails.
func (z *ZipRun) Run() error {
var (
k0, k1, c0, c1 string // key: k0, k1; current line: c0, c1