diff options
-rw-r--r-- | skate/cmd/skate-conv/main.go | 2 |
-rw-r--r-- | skate/verify.go | 23 |
2 files changed, 15 insertions, 10 deletions
diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go index 34e79a3..4a33473 100644 --- a/skate/cmd/skate-conv/main.go +++ b/skate/cmd/skate-conv/main.go @@ -36,7 +36,7 @@ func main() { case "ol": f = openLibraryToRelease default: - log.Fatal("unsupported input schema: %v", *fromFormat) + log.Fatalf("unsupported input schema: %v", *fromFormat) } pp := parallel.NewProcessor(os.Stdin, os.Stdout, f) pp.NumWorkers = *numWorkers diff --git a/skate/verify.go b/skate/verify.go index d1f98f0..5367ffe 100644 --- a/skate/verify.go +++ b/skate/verify.go @@ -134,9 +134,9 @@ func JsonMarshalNewline(v interface{}) ([]byte, error) { return b, nil } -// ClusterVerifyMaxSize runs verification across all pairs in the cluster. This is a +// ClusterVerifyMaxClusterSize runs verification across all pairs in the cluster. This is a // port of https://git.io/JYgOB from fuzzycat. -func ClusterVerifyMaxSize(p []byte, maxClusterSize int) ([]byte, error) { +func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) { var ( rc *ReleaseCluster buf bytes.Buffer @@ -172,7 +172,7 @@ func ClusterVerifyMaxSize(p []byte, maxClusterSize int) ([]byte, error) { // ClusterVerify runs verification process across all pairs, but skips clusters // containing more than ten elements. func ClusterVerify(p []byte) ([]byte, error) { - return ClusterVerifyMaxSize(p, 10) + return ClusterVerifyMaxClusterSize(p, 10) } // RefClusterVerify deserializes a cluster document containing both converted @@ -243,8 +243,7 @@ func RefClusterToBiblioRef(p []byte) ([]byte, error) { } // generateBiblioRef generates a bibliographic schema document. 
-func generateBiblioRef(source, target *Release, - matchResult MatchResult, provenance string) *BiblioRef { +func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef { var bref BiblioRef bref.SourceReleaseIdent = source.Ident bref.SourceWorkIdent = source.WorkID @@ -736,7 +735,8 @@ func parsePageString(s string) *ParsedPages { // averageScore take a limited set of authors and calculates pairwise // similarity scores, then returns the average of the best scores; between 0 -// and 1. +// and 1. XXX: This should be revisited and factored out; reading: +// https://github.com/djudd/human-name. func averageScore(a, b set.Set) float64 { aTrimmed := a.TopK(5) bTrimmed := b.TopK(5) @@ -763,8 +763,8 @@ func authorSimilarityScore(s, t string) float64 { return ss.Jaccard(ts) } -// tokenNgrams are groups of n tokens per token in string, e.g. for n=2 and -// string "Anne K Lam", we would get ["an", "ne", "k", "la", "m"]. +// tokenNgrams are groups of n char-tokens per word-token in string, e.g. for +// n=2 and string "Anne K Lam", we would get ["an", "ne", "k", "la", "m"]. func tokenNgrams(s string, n int) (result []string) { var buf bytes.Buffer for _, token := range tokenizeString(s) { @@ -790,7 +790,12 @@ func doiPrefix(s string) string { return parts[0] } -// unifyDigits replaces all digit groups with a placeholder, e.g. "<NUM>". +// unifyDigits replaces all digit groups with a hopefully rare placeholder, +// e.g. "<NUM>"; This is for discovering very similar, yet different +// publications, where e.g. titles differ only by a single char representing a +// year. Examples are yearly publications, e.g. "World Health Report 2020", +// where any plain similarity score would yield a high number, yet publications +// are obviously not the same. func unifyDigits(s string) string { return PatDigits.ReplaceAllString(s, "<NUM>") } |