author    Martin Czygan <martin.czygan@gmail.com>    2021-05-05 16:54:48 +0200
committer Martin Czygan <martin.czygan@gmail.com>    2021-05-05 16:54:48 +0200
commit    c801b2ff0aa93489eed89ddb1f2d62404fc89ca2 (patch)
tree      7e5249666f3a9217c58329bb03ac60d84d98e675
parent    0e26ef6b1b7998198bc92ac9890f6fe42c86a45f (diff)
download  refcat-c801b2ff0aa93489eed89ddb1f2d62404fc89ca2.tar.gz
          refcat-c801b2ff0aa93489eed89ddb1f2d62404fc89ca2.zip
split functionality up a bit
-rw-r--r--    skate/cmd/skate-verify/main.go      6
-rw-r--r--    skate/schema.go                     2
-rw-r--r--    skate/verify.go                   176
-rw-r--r--    skate/zippy.go                    171
4 files changed, 185 insertions, 170 deletions
diff --git a/skate/cmd/skate-verify/main.go b/skate/cmd/skate-verify/main.go
index 3b40488..895d508 100644
--- a/skate/cmd/skate-verify/main.go
+++ b/skate/cmd/skate-verify/main.go
@@ -81,7 +81,7 @@ func main() {
if !ok {
mr = matchResults["unknown"]
}
- if err := skate.ZipUnverified(f, g, mr, *provenance, bw); err != nil {
+ if err := skate.ZippyFixed(f, g, mr, *provenance, bw); err != nil {
log.Fatal(err)
}
case "zip":
@@ -102,7 +102,7 @@ func main() {
defer g.Close()
bw := bufio.NewWriter(os.Stdout)
defer bw.Flush()
- if err := skate.ZipVerifyRefs(f, g, bw); err != nil {
+ if err := skate.ZippyVerifyRefs(f, g, bw); err != nil {
log.Fatal(err)
}
case "ref":
@@ -138,7 +138,7 @@ func main() {
defer g.Close()
bw := bufio.NewWriter(os.Stdout)
defer bw.Flush()
- if err := skate.ZipWikiUnverified(f, g, skate.MatchResult{skate.StatusExact, skate.ReasonDOI}, "wiki", bw); err != nil {
+ if err := skate.ZippyFixedWiki(f, g, skate.MatchResult{skate.StatusExact, skate.ReasonDOI}, "wiki", bw); err != nil {
log.Fatal(err)
}
default:
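
Note (illustration, not part of the diff): all zip modes above expect both inputs to be TSV files pre-sorted by the key in column 2, one document per line in the form ident, key, JSON doc. A made-up input line, with tabs as separators and illustrative field names inside the JSON document:

release-123	degradationofpolylactide	{"ident": "release-123", "title": "Degradation of Polylactide"}
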
diff --git a/skate/schema.go b/skate/schema.go
index 1878205..52aa91a 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -305,7 +305,7 @@ type BiblioRef struct {
}
// ReleaseCluster, a list of match candidates. This is typically serialized as a
-// single JSON line.
+// single JSON line containing the match key and a list of release documents.
type ReleaseCluster struct {
Key string `json:"k"`
Values []*Release `json:"v"`
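
An illustrative cluster line in this shape (the "k" and "v" keys come from the struct tags above; the release documents are abbreviated and their values made up):

{"k": "degradationofpolylactide", "v": [{"ident": "release-123", ...}, {"ident": "release-456", ...}]}
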
diff --git a/skate/verify.go b/skate/verify.go
index 5367ffe..18b2f4e 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -14,11 +14,10 @@ import (
"unicode/utf8"
"git.archive.org/martin/cgraph/skate/set"
- "git.archive.org/martin/cgraph/skate/zipkey"
json "github.com/segmentio/encoding/json"
)
-// This file is a port of fuzzycat.verify to Go.
+// This file contains a port of fuzzycat.verify to Go.
type (
// Status represents match strength.
@@ -135,7 +134,7 @@ func JsonMarshalNewline(v interface{}) ([]byte, error) {
}
// ClusterVerifyMaxClusterSize runs verification across all pairs in the cluster. This is a
-// port of https://git.io/JYgOB from fuzzycat.
+// port of https://git.io/JYgOB from fuzzycat. This is good for "self-match" verification.
func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) {
var (
rc *ReleaseCluster
@@ -170,14 +169,16 @@ func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) {
}
// ClusterVerify runs verification process across all pairs, but skips clusters
-// containing more than ten elements.
+// containing more than ten elements. A cluster with more than ten elements
+// may also signal an overly ambiguous title. Besides, we do not want this to
+// be too slow.
func ClusterVerify(p []byte) ([]byte, error) {
return ClusterVerifyMaxClusterSize(p, 10)
}
// RefClusterVerify deserializes a cluster document containing both converted
// references and releases and returns a tabular verification result between
-// one release and all references found. This depends on refs and releases
+// one (any) release and all references found. This depends on refs and releases
// being distinguishable, (e.g. via .extra.skate.status == "ref").
func RefClusterVerify(p []byte) ([]byte, error) {
var (
@@ -208,7 +209,8 @@ func RefClusterVerify(p []byte) ([]byte, error) {
return buf.Bytes(), nil
}
-// RefClusterToBiblioRef creates a BiblioRef schema from exact and strong matches only.
+// RefClusterToBiblioRef runs verification and creates a BiblioRef schema from
+// exact and strong matches only.
func RefClusterToBiblioRef(p []byte) ([]byte, error) {
var (
rc *ReleaseCluster
@@ -236,13 +238,14 @@ func RefClusterToBiblioRef(p []byte) ([]byte, error) {
br = generateBiblioRef(re, pivot, result, "fuzzy")
return JsonMarshalNewline(br)
default:
+ // XXX: may want to include non-matches here.
continue
}
}
return buf.Bytes(), nil
}
-// generateBiblioRef generates a bibliographic schema document.
+// generateBiblioRef generates a bibliographic schema document. XXX: This is a bit odd.
func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef {
var bref BiblioRef
bref.SourceReleaseIdent = source.Ident
@@ -261,165 +264,6 @@ func generateBiblioRef(source, target *Release, matchResult MatchResult, provena
return &bref
}
-// makeKeyFunc creates a function that can be used as keyFunc, selecting a
-// column from sep.
-func makeKeyFunc(sep string, column int) func(string) (string, error) {
- return func(s string) (string, error) {
- if k := lineColumn(s, "\t", 2); k == "" {
- return k, fmt.Errorf("cannot get key: %s", s)
- } else {
- return k, nil
- }
- }
-}
-
-// ZipUnverified takes a release and refs reader (tsv, with ident, key, doc)
-// and assigns a fixed match result.
-func ZipUnverified(releases, refs io.Reader, mr MatchResult, provenance string, w io.Writer) error {
- var (
- enc = json.NewEncoder(w)
- keyer = makeKeyFunc("\t", 2)
- grouper = func(g *zipkey.Group) error {
- if len(g.G0) == 0 || len(g.G1) == 0 {
- return nil
- }
- target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
- if err != nil {
- return err
- }
- for _, line := range g.G1 {
- ref, err := stringToRef(lineColumn(line, "\t", 3))
- if err != nil {
- return err
- }
- var bref BiblioRef
- bref.SourceReleaseIdent = ref.ReleaseIdent
- bref.SourceWorkIdent = ref.WorkIdent
- bref.SourceReleaseStage = ref.ReleaseStage
- bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear)
- bref.RefIndex = ref.Index + 1 // we want 1-index (also helps with omitempty)
- bref.RefKey = ref.Key
- bref.TargetReleaseIdent = target.Ident
- bref.TargetWorkIdent = target.WorkID
- bref.MatchProvenance = provenance
- bref.MatchStatus = mr.Status.Short()
- bref.MatchReason = mr.Reason.Short()
- if err := enc.Encode(bref); err != nil {
- return err
- }
- }
- return nil
- }
- )
- zipper := zipkey.New(releases, refs, keyer, grouper)
- return zipper.Run()
-}
-
-// ZipWikiUnverified takes a release and wiki reader (tsv, with ident, key, doc)
-// and assigns a fixed match result.
-func ZipWikiUnverified(releases, wiki io.Reader, mr MatchResult, provenance string, w io.Writer) error {
- var (
- enc = json.NewEncoder(w)
- keyer = makeKeyFunc("\t", 2)
- grouper = func(g *zipkey.Group) error {
- if len(g.G0) == 0 || len(g.G1) == 0 {
- return nil
- }
- target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
- if err != nil {
- return err
- }
- for _, line := range g.G1 {
- wiki, err := stringToWiki(lineColumn(line, "\t", 3))
- if err != nil {
- return err
- }
- var bref BiblioRef
- bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use?
- bref.SourceWikipediaArticle = wiki.PageTitle
- bref.TargetReleaseIdent = target.Ident
- bref.TargetWorkIdent = target.WorkID
- bref.MatchProvenance = provenance
- bref.MatchStatus = mr.Status.Short()
- bref.MatchReason = mr.Reason.Short()
- if err := enc.Encode(bref); err != nil {
- return err
- }
- }
- return nil
- }
- )
- zipper := zipkey.New(releases, wiki, keyer, grouper)
- return zipper.Run()
-}
-
-// ZipVerifyRefs takes a release and refs reader (tsv, with ident, key, doc)
-// and will execute gf for each group found.
-func ZipVerifyRefs(releases, refs io.Reader, w io.Writer) error {
- // Define a grouper, working on one set of refs and releases with the same
- // key at a time. Here, we do verification and write out the generated
- // biblioref.
- var (
- enc = json.NewEncoder(w)
- keyer = makeKeyFunc("\t", 2)
- grouper = func(g *zipkey.Group) error {
- if len(g.G0) == 0 || len(g.G1) == 0 {
- return nil
- }
- pivot, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
- if err != nil {
- return err
- }
- for _, line := range g.G1 {
- re, err := stringToRelease(lineColumn(line, "\t", 3))
- if err != nil {
- return err
- }
- result := Verify(pivot, re)
- switch result.Status {
- case StatusExact, StatusStrong:
- if result.Reason == ReasonDOI {
- continue
- }
- br := generateBiblioRef(re, pivot, result, "fuzzy")
- if err := enc.Encode(br); err != nil {
- return err
- }
- }
- }
- return nil
- }
- )
- zipper := zipkey.New(releases, refs, keyer, grouper)
- return zipper.Run()
-}
-
-// lineColumn returns a specific column (1-indexed, like cut) from a tabular
-// file, returns empty string if column is invalid.
-func lineColumn(line, sep string, column int) string {
- parts := strings.Split(strings.TrimSpace(line), sep)
- if len(parts) < column {
- return ""
- } else {
- return parts[column-1]
- }
-}
-
-func stringToRelease(s string) (r *Release, err error) {
- err = json.Unmarshal([]byte(s), &r)
- return
-}
-
-func stringToRef(s string) (r *Ref, err error) {
- err = json.Unmarshal([]byte(s), &r)
- return
-}
-
-func stringToWiki(s string) (r *MinimalCitations, err error) {
- err = json.Unmarshal([]byte(s), &r)
- return
-}
-
// Verify verifies two releases and will ignore short titles.
func Verify(a, b *Release) MatchResult {
return VerifyMinTitleLength(a, b, 5)
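
As a quick sketch of the verification entry point (not part of this commit; written from within the skate package, and assuming fatcat-style "ident" and "title" JSON field names on Release):

	a, _ := stringToRelease(`{"ident": "release-aaa", "title": "Degradation of Polylactide"}`)
	b, _ := stringToRelease(`{"ident": "release-bbb", "title": "Degradation of polylactide"}`)
	result := Verify(a, b) // ignores short titles, via VerifyMinTitleLength(a, b, 5)
	fmt.Println(result.Status.Short(), result.Reason.Short())
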
diff --git a/skate/zippy.go b/skate/zippy.go
new file mode 100644
index 0000000..76f576d
--- /dev/null
+++ b/skate/zippy.go
@@ -0,0 +1,171 @@
+package skate
+
+import (
+ "fmt"
+ "io"
+ "strings"
+
+ "git.archive.org/martin/cgraph/skate/zipkey"
+ json "github.com/segmentio/encoding/json"
+)
+
+// This file contains the two-stream (zippy) matchers.
+
+// ZippyFixed takes a release and refs reader (tsv, with ident, key, doc)
+// and assigns a fixed match result.
+func ZippyFixed(releases, refs io.Reader, mr MatchResult, provenance string, w io.Writer) error {
+ var (
+ enc = json.NewEncoder(w)
+ keyer = makeKeyFunc("\t", 2)
+ grouper = func(g *zipkey.Group) error {
+ if len(g.G0) == 0 || len(g.G1) == 0 {
+ return nil
+ }
+ target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
+ if err != nil {
+ return err
+ }
+ for _, line := range g.G1 {
+ ref, err := stringToRef(lineColumn(line, "\t", 3))
+ if err != nil {
+ return err
+ }
+ var bref BiblioRef
+ bref.SourceReleaseIdent = ref.ReleaseIdent
+ bref.SourceWorkIdent = ref.WorkIdent
+ bref.SourceReleaseStage = ref.ReleaseStage
+ bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear)
+ bref.RefIndex = ref.Index + 1 // we want 1-index (also helps with omitempty)
+ bref.RefKey = ref.Key
+ bref.TargetReleaseIdent = target.Ident
+ bref.TargetWorkIdent = target.WorkID
+ bref.MatchProvenance = provenance
+ bref.MatchStatus = mr.Status.Short()
+ bref.MatchReason = mr.Reason.Short()
+ if err := enc.Encode(bref); err != nil {
+ return err
+ }
+ }
+ return nil
+ }
+ )
+ zipper := zipkey.New(releases, refs, keyer, grouper)
+ return zipper.Run()
+}
+
+// ZippyFixedWiki takes a release and wiki reader (tsv, with ident, key, doc)
+// and assigns a fixed match result.
+func ZippyFixedWiki(releases, wiki io.Reader, mr MatchResult, provenance string, w io.Writer) error {
+ var (
+ enc = json.NewEncoder(w)
+ keyer = makeKeyFunc("\t", 2)
+ grouper = func(g *zipkey.Group) error {
+ if len(g.G0) == 0 || len(g.G1) == 0 {
+ return nil
+ }
+ target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
+ if err != nil {
+ return err
+ }
+ for _, line := range g.G1 {
+ wiki, err := stringToWiki(lineColumn(line, "\t", 3))
+ if err != nil {
+ return err
+ }
+ var bref BiblioRef
+ bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use?
+ bref.SourceWikipediaArticle = wiki.PageTitle
+ bref.TargetReleaseIdent = target.Ident
+ bref.TargetWorkIdent = target.WorkID
+ bref.MatchProvenance = provenance
+ bref.MatchStatus = mr.Status.Short()
+ bref.MatchReason = mr.Reason.Short()
+ if err := enc.Encode(bref); err != nil {
+ return err
+ }
+ }
+ return nil
+ }
+ )
+ zipper := zipkey.New(releases, wiki, keyer, grouper)
+ return zipper.Run()
+}
+
+// ZippyVerifyRefs takes a releases and a refs reader (tsv, with ident, key, doc)
+// and verifies each group of refs against its release, writing out biblioref
+// documents for exact and strong matches.
+func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error {
+ // Define a grouper, working on one set of refs and releases with the same
+ // key at a time. Here, we do verification and write out the generated
+ // biblioref.
+ var (
+ enc = json.NewEncoder(w)
+ keyer = makeKeyFunc("\t", 2)
+ grouper = func(g *zipkey.Group) error {
+ if len(g.G0) == 0 || len(g.G1) == 0 {
+ return nil
+ }
+ pivot, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
+ if err != nil {
+ return err
+ }
+ for _, line := range g.G1 {
+ re, err := stringToRelease(lineColumn(line, "\t", 3))
+ if err != nil {
+ return err
+ }
+ result := Verify(pivot, re)
+ switch result.Status {
+ case StatusExact, StatusStrong:
+ if result.Reason == ReasonDOI {
+ continue
+ }
+ br := generateBiblioRef(re, pivot, result, "fuzzy")
+ if err := enc.Encode(br); err != nil {
+ return err
+ }
+ }
+ }
+ return nil
+ }
+ )
+ zipper := zipkey.New(releases, refs, keyer, grouper)
+ return zipper.Run()
+}
+
+// makeKeyFunc creates a function that can be used as a keyFunc, selecting a
+// column (1-indexed) from a line split by sep.
+func makeKeyFunc(sep string, column int) func(string) (string, error) {
+ return func(s string) (string, error) {
+ if k := lineColumn(s, sep, column); k == "" {
+ return k, fmt.Errorf("cannot get key: %s", s)
+ } else {
+ return k, nil
+ }
+ }
+}
+
+// lineColumn returns a specific column (1-indexed, like cut) from a tabular
+// line; it returns an empty string if the column does not exist.
+func lineColumn(line, sep string, column int) string {
+ parts := strings.Split(strings.TrimSpace(line), sep)
+ if len(parts) < column {
+ return ""
+ } else {
+ return parts[column-1]
+ }
+}
+
+func stringToRelease(s string) (r *Release, err error) {
+ err = json.Unmarshal([]byte(s), &r)
+ return
+}
+
+func stringToRef(s string) (r *Ref, err error) {
+ err = json.Unmarshal([]byte(s), &r)
+ return
+}
+
+func stringToWiki(s string) (r *MinimalCitations, err error) {
+ err = json.Unmarshal([]byte(s), &r)
+ return
+}
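
For context, a minimal, self-contained sketch (not part of the commit) of the two-stream pattern that ZippyFixed, ZippyFixedWiki and ZippyVerifyRefs share: both inputs are sorted by the key in column 2, zipkey groups lines with identical keys, and the grouper sees each group as G0 (release lines) and G1 (ref or wiki lines). All data below is made up.

package main

import (
	"fmt"
	"log"
	"strings"

	"git.archive.org/martin/cgraph/skate/zipkey"
)

func main() {
	// Two key-sorted streams, one "ident<TAB>key<TAB>doc" document per line.
	releases := strings.NewReader("r1\taaa\t{}\nr2\tbbb\t{}\n")
	refs := strings.NewReader("x1\taaa\t{}\nx2\taaa\t{}\nx3\tccc\t{}\n")
	// keyer extracts the grouping key (column 2), mirroring makeKeyFunc("\t", 2).
	keyer := func(s string) (string, error) {
		parts := strings.Split(s, "\t")
		if len(parts) < 2 {
			return "", fmt.Errorf("cannot get key: %s", s)
		}
		return parts[1], nil
	}
	// grouper is called once per key group; G0 holds release lines, G1 ref lines.
	grouper := func(g *zipkey.Group) error {
		fmt.Printf("key group: %d release line(s), %d ref line(s)\n", len(g.G0), len(g.G1))
		return nil
	}
	zipper := zipkey.New(releases, refs, keyer, grouper)
	if err := zipper.Run(); err != nil {
		log.Fatal(err)
	}
}
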