From 912dd621a6ac66f86b2ad32df8db6ebc1570c0f3 Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Thu, 22 Jul 2021 00:23:00 +0200
Subject: cleanup (old) clustering related code

---
 skate/cmd/skate-reduce/main.go |  26 ------
 skate/verify.go                | 180 ++++++++---------------------------------
 skate/zipkey/zipkey.go         |  10 +--
 3 files changed, 39 insertions(+), 177 deletions(-)

diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go
index 2ff7de4..7918a28 100644
--- a/skate/cmd/skate-reduce/main.go
+++ b/skate/cmd/skate-reduce/main.go
@@ -20,17 +20,6 @@
 // |       $ skate-reduce -m fuzzy -F a.tsv -L b.tsv
 // |
 // |
-// * ref   | takes a single file with clusters containing releases and refs and
-// |         will emit verification results (deprecated).
-// |
-// |       $ skate-reduce -m ref < a.ndj
-// |
-// |
-// * bref  | same as ref, but generate a biblioref file as output (deprecated).
-// |
-// |       $ skate-reduce -m bref < a.ndj
-// |
-// |
 // * wiki  | zippy mode for releases and wikipedia inputs.
 // |
 // |       $ skate-reduce -m wiki -L a.ndj -W b.ndj
@@ -72,7 +61,6 @@ import (
 	"runtime"
 
 	"git.archive.org/martin/cgraph/skate"
-	"git.archive.org/martin/cgraph/skate/parallel"
 	"git.archive.org/martin/cgraph/skate/xio"
 	gzip "github.com/klauspost/compress/gzip"
 )
@@ -153,20 +141,6 @@ func main() {
 		if err := skate.ZippyVerifyRefs(l, f, bw); err != nil {
 			log.Fatal(err)
 		}
-	case "ref":
-		pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterVerify)
-		pp.NumWorkers = *numWorkers
-		pp.BatchSize = *batchSize
-		if err := pp.Run(); err != nil {
-			log.Fatal(err)
-		}
-	case "bref":
-		pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterToBiblioRef)
-		pp.NumWorkers = *numWorkers
-		pp.BatchSize = *batchSize
-		if err := pp.Run(); err != nil {
-			log.Fatal(err)
-		}
 	case "wiki":
 		l, w, err := xio.OpenTwo(*releases, *wiki)
 		if err != nil {
diff --git a/skate/verify.go b/skate/verify.go
index 5cb56bb..22f0a0d 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -1,5 +1,8 @@
 // TODO: The various grouping and verification functions should probably be in
 // a separate file and it should be obvious how to adjust or write a new one.
+//
+// This file contains a port of fuzzycat.verify
+// (https://gitlab.com/internetarchive/fuzzycat) to Go.
 
 //go:generate stringer -type=Status,Reason -output verify_string.go verify.go
 package skate
@@ -7,7 +10,6 @@ package skate
 import (
 	"bytes"
 	"fmt"
-	"io"
 	"regexp"
 	"strconv"
 	"strings"
@@ -17,8 +19,6 @@ import (
 	"github.com/segmentio/encoding/json"
 )
 
-// This file contains a port of fuzzycat.verify to Go.
-
 type (
 	// Status represents match strength.
 	Status int
@@ -87,12 +87,22 @@ const (
 	ReasonYear
 )
 
-// Short name.
+var (
+	PatAppendix        = regexp.MustCompile(`appendix ?[^ ]*$`)
+	PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`)
+	PatVersionedDOI    = regexp.MustCompile(`10[.].*/v[0-9]{1,}$`)
+	PatArxivVersion    = regexp.MustCompile(`(.*)v[0-9]{1,2}$`)
+	PatFilenameLike    = regexp.MustCompile(`.*[.][a-z]{2,3}$`)
+	PatDigits          = regexp.MustCompile(`\d+`)
+	PatPages           = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`)
+)
+
+// Short name for status.
 func (s Status) Short() string {
 	return strings.ToLower(strings.Replace(s.String(), "Status", "", 1))
 }
 
-// Short name.
+// Short name for reason.
 func (r Reason) Short() string {
 	return strings.ToLower(strings.Replace(r.String(), "Reason", "", 1))
 }
@@ -116,16 +126,6 @@ func (m *MatchPair) AsLine() string {
 	return fmt.Sprintf("%s\t%s\t%s\t%s\n", m.A, m.B, m.Result.Status, m.Result.Reason)
 }
 
-var (
-	PatAppendix        = regexp.MustCompile(`appendix ?[^ ]*$`)
-	PatFigshareVersion = regexp.MustCompile(`[.]v[0-9]+$`)
-	PatVersionedDOI    = regexp.MustCompile(`10[.].*/v[0-9]{1,}$`)
-	PatArxivVersion    = regexp.MustCompile(`(.*)v[0-9]{1,2}$`)
-	PatFilenameLike    = regexp.MustCompile(`.*[.][a-z]{2,3}$`)
-	PatDigits          = regexp.MustCompile(`\d+`)
-	PatPages           = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`)
-)
-
 // JsonMarshalNewline marshals a value as JSON and adds a newline.
 func JsonMarshalNewline(v interface{}) ([]byte, error) {
 	b, err := json.Marshal(v)
@@ -136,137 +136,6 @@ func JsonMarshalNewline(v interface{}) ([]byte, error) {
 	return b, nil
 }
 
-// ClusterVerifyMaxClusterSize runs verification across all pairs in the cluster. This is a
-// port of https://git.io/JYgOB from fuzzycat. This is good for "self-match" verification.
-func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) {
-	var (
-		rc  *ReleaseCluster
-		buf bytes.Buffer
-		n   int
-	)
-	if err := json.Unmarshal(p, &rc); err != nil {
-		return nil, err
-	}
-	if n = len(rc.Values); n > maxClusterSize {
-		return nil, nil
-	}
-	// O(n^2) ahead, specifically, n * (n-1) / 2.
-	for i := 0; i < n; i++ {
-		for j := i; j < n; j++ {
-			if i == j {
-				continue
-			}
-			a := rc.Values[i]
-			b := rc.Values[j]
-			matchPair := &MatchPair{
-				A:      a.Ident,
-				B:      b.Ident,
-				Result: Verify(a, b),
-			}
-			if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil {
-				return nil, err
-			}
-		}
-	}
-	return buf.Bytes(), nil
-}
-
-// ClusterVerify runs the verification process across all pairs, but skips
-// clusters containing more than ten elements. If a cluster has more than 10
-// elements, it might also signal an overly ambiguous title. Besides, we do
-// not want this to be too slow.
-func ClusterVerify(p []byte) ([]byte, error) {
-	return ClusterVerifyMaxClusterSize(p, 10)
-}
-
-// RefClusterVerify deserializes a cluster document containing both converted
-// references and releases and returns a tabular verification result between
-// one (any) release and all references found. This depends on refs and releases
-// being distinguishable (e.g. via .extra.skate.status == "ref").
-func RefClusterVerify(p []byte) ([]byte, error) {
-	var (
-		rc        *ReleaseCluster
-		buf       bytes.Buffer
-		pivot, re *Release
-		err       error
-	)
-	if err = json.Unmarshal(p, &rc); err != nil {
-		return nil, err
-	}
-	if pivot, err = rc.OneNonRef(); err != nil {
-		return nil, err
-	}
-	for _, re = range rc.Values {
-		if re.Extra.Skate.Status != "ref" {
-			continue
-		}
-		matchPair := &MatchPair{
-			A:      pivot.Ident,
-			B:      re.Ident,
-			Result: Verify(pivot, re),
-		}
-		if _, err := io.WriteString(&buf, matchPair.AsLine()); err != nil {
-			return nil, err
-		}
-	}
-	return buf.Bytes(), nil
-}
-
-// RefClusterToBiblioRef runs verification and creates a BiblioRef schema from
-// exact and strong matches only.
-func RefClusterToBiblioRef(p []byte) ([]byte, error) {
-	var (
-		rc        *ReleaseCluster
-		br        *BiblioRef
-		buf       bytes.Buffer
-		pivot, re *Release
-		err       error
-	)
-	if err = json.Unmarshal(p, &rc); err != nil {
-		return nil, err
-	}
-	if pivot, err = rc.OneNonRef(); err != nil {
-		return nil, err
-	}
-	for _, re = range rc.Values {
-		if re.Extra.Skate.Status != "ref" {
-			continue
-		}
-		result := Verify(pivot, re)
-		switch result.Status {
-		case StatusExact, StatusStrong:
-			if result.Reason == ReasonDOI {
-				continue // Assume we already have the DOI matches.
-			}
-			br = generateBiblioRef(re, pivot, result, "fuzzy")
-			return JsonMarshalNewline(br)
-		default:
-			// XXX: may want to include non-matches here.
-			continue
-		}
-	}
-	return buf.Bytes(), nil
-}
-
-// generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd.
-func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef {
-	var bref BiblioRef
-	bref.SourceReleaseIdent = source.Ident
-	bref.SourceWorkIdent = source.WorkID
-	bref.SourceReleaseStage = source.ReleaseStage
-	if source.ReleaseYear() > 1000 {
-		bref.SourceYear = source.ReleaseYearString()
-	}
-	bref.RefIndex = source.Extra.Skate.Ref.Index
-	bref.RefKey = source.Extra.Skate.Ref.Key
-	bref.TargetReleaseIdent = target.Ident
-	bref.TargetWorkIdent = target.WorkID
-	bref.MatchProvenance = provenance
-	bref.MatchStatus = matchResult.Status.Short()
-	bref.MatchReason = matchResult.Reason.Short()
-	return &bref
-}
-
 // Verify verifies two releases and will ignore short titles.
 func Verify(a, b *Release) MatchResult {
 	return VerifyMinTitleLength(a, b, 5)
@@ -542,6 +411,25 @@ func VerifyMinTitleLength(a, b *Release, minTitleLength int) MatchResult {
 	}
 }
 
+// generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd.
+func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef {
+	var bref BiblioRef
+	bref.SourceReleaseIdent = source.Ident
+	bref.SourceWorkIdent = source.WorkID
+	bref.SourceReleaseStage = source.ReleaseStage
+	if source.ReleaseYear() > 1000 {
+		bref.SourceYear = source.ReleaseYearString()
+	}
+	bref.RefIndex = source.Extra.Skate.Ref.Index
+	bref.RefKey = source.Extra.Skate.Ref.Key
+	bref.TargetReleaseIdent = target.Ident
+	bref.TargetWorkIdent = target.WorkID
+	bref.MatchProvenance = provenance
+	bref.MatchStatus = matchResult.Status.Short()
+	bref.MatchReason = matchResult.Reason.Short()
+	return &bref
+}
+
 type ParsedPages struct {
 	Start  int
 	End    int
diff --git a/skate/zipkey/zipkey.go b/skate/zipkey/zipkey.go
index ffd33fe..e5e9f07 100644
--- a/skate/zipkey/zipkey.go
+++ b/skate/zipkey/zipkey.go
@@ -21,10 +21,10 @@ type (
 )
 
 // ZipRun reads records (separated by sep) from two readers, extracts a key
-// from each record with a keyFunc and collects records from the two streams
-// into a Group. A callback groupFunc can be registered, which allows to
-// customize the processing of the group. Current limitation: both streams need
-// to use the same keyFunc.
+// from each record with a keyFunc and collects records with the same key from
+// the two streams into a Group. A callback groupFunc can be registered, which
+// allows customizing the processing of the group. Current limitation: both
+// streams need to use the same keyFunc.
 type ZipRun struct {
 	r0, r1 *bufio.Reader
 	kf     keyFunc
@@ -44,7 +44,7 @@ func New(r0, r1 io.Reader, kf keyFunc, gf groupFunc) *ZipRun {
 }
 
 // Run starts reading from both readers. The process stops if one reader is
-// exhausted or reads from any reader fail.
+// exhausted or a read from any reader fails.
 func (z *ZipRun) Run() error {
 	var (
 		k0, k1, c0, c1 string // key: k0, k1; current line: c0, c1
--
cgit v1.2.3
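
Note on the Pat* block hoisted to the top of skate/verify.go: these regular
expressions normalize identifier and title variants (figshare/DOI/arXiv
version suffixes, filename-like titles, page ranges) before comparison. Below
is a minimal, self-contained sketch of how two of them behave; the input
values and the version-stripping shown are illustrative assumptions, not the
actual Verify logic.

package main

import (
	"fmt"
	"regexp"
)

// Local mirrors of PatVersionedDOI and PatArxivVersion from skate/verify.go.
var (
	patVersionedDOI = regexp.MustCompile(`10[.].*/v[0-9]{1,}$`)
	patArxivVersion = regexp.MustCompile(`(.*)v[0-9]{1,2}$`)
)

func main() {
	// A hypothetical DOI that only differs by a trailing version suffix.
	fmt.Println(patVersionedDOI.MatchString("10.1234/zenodo.5678/v2")) // true
	// Strip a trailing arXiv-style version marker: "v3" drops off.
	fmt.Println(patArxivVersion.ReplaceAllString("1802.01234v3", "$1")) // 1802.01234
}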
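For the record, the removed bref mode boiled down to a gate on the
verification result: only exact or strong matches produced a fuzzy BiblioRef,
and DOI-based reasons were skipped, since those pairs are assumed to be
covered by the exact (non-fuzzy) joins already. A sketch of that gate with
simplified stand-in types; the real Status, Reason and MatchResult live in
skate/verify.go and their constants come from the stringer-generated
enumeration.

package main

import "fmt"

// Simplified stand-ins for the types in skate/verify.go.
type (
	Status int
	Reason int
)

const (
	StatusExact Status = iota
	StatusStrong
	StatusWeak
)

const (
	ReasonDOI Reason = iota
	ReasonTitle
)

type MatchResult struct {
	Status Status
	Reason Reason
}

// keepForBref mirrors the gating in the removed RefClusterToBiblioRef: only
// exact or strong matches yield a fuzzy BiblioRef, and DOI-based matches are
// skipped, assuming the exact joins produced those pairs already.
func keepForBref(r MatchResult) bool {
	switch r.Status {
	case StatusExact, StatusStrong:
		return r.Reason != ReasonDOI
	default:
		return false
	}
}

func main() {
	fmt.Println(keepForBref(MatchResult{StatusStrong, ReasonTitle})) // true
	fmt.Println(keepForBref(MatchResult{StatusExact, ReasonDOI}))    // false
	fmt.Println(keepForBref(MatchResult{StatusWeak, ReasonTitle}))   // false
}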
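The zipkey.ZipRun touched in the last two hunks is essentially a sort-merge
join over two key-sorted line streams, which is what the surviving zippy modes
of skate-reduce build on. The sketch below shows the idea with simplified
names and signatures; it is not the package's actual implementation (it uses a
bufio.Scanner, string keys and a hard-coded newline separator, and it assumes
both inputs are already sorted by key).

package main

import (
	"bufio"
	"fmt"
	"io"
	"strings"
)

// Group collects the records sharing one key from two sorted streams.
type Group struct {
	Key    string
	G0, G1 []string
}

// zip performs a sort-merge join: it advances whichever reader currently has
// the smaller key and, on a key match, collects the run of equal-key lines
// from both sides into a Group, which is handed to the callback f. The loop
// stops as soon as one reader is exhausted.
func zip(r0, r1 io.Reader, key func(string) string, f func(*Group) error) error {
	s0, s1 := bufio.NewScanner(r0), bufio.NewScanner(r1)
	next := func(s *bufio.Scanner) (line, k string, ok bool) {
		if !s.Scan() {
			return "", "", false
		}
		line = s.Text()
		return line, key(line), true
	}
	l0, k0, ok0 := next(s0)
	l1, k1, ok1 := next(s1)
	for ok0 && ok1 {
		switch {
		case k0 < k1:
			l0, k0, ok0 = next(s0)
		case k0 > k1:
			l1, k1, ok1 = next(s1)
		default:
			g := &Group{Key: k0}
			// Collect the full run of equal keys on both sides.
			for ok0 && k0 == g.Key {
				g.G0 = append(g.G0, l0)
				l0, k0, ok0 = next(s0)
			}
			for ok1 && k1 == g.Key {
				g.G1 = append(g.G1, l1)
				l1, k1, ok1 = next(s1)
			}
			if err := f(g); err != nil {
				return err
			}
		}
	}
	return nil
}

func main() {
	// Both inputs must be sorted by key (here: the first tab-separated column).
	a := strings.NewReader("k1\trelease-1\nk2\trelease-2\n")
	b := strings.NewReader("k1\tref-1\nk1\tref-2\nk3\tref-3\n")
	key := func(s string) string { return strings.SplitN(s, "\t", 2)[0] }
	_ = zip(a, b, key, func(g *Group) error {
		fmt.Println(g.Key, g.G0, g.G1)
		return nil
	})
}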