update docs

author: Martin Czygan <martin.czygan@gmail.com> 2021-05-05 16:26:29 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-05-05 16:26:29 +0200
commit: 0e26ef6b1b7998198bc92ac9890f6fe42c86a45f (patch)
tree: 6fa1dd861199f8d650f81034317fea4194a23778 /skate
parent: af1283dfcc2f56d4ac53dda619583f93fe6dfe87 (diff)
download: refcat-0e26ef6b1b7998198bc92ac9890f6fe42c86a45f.tar.gz
refcat-0e26ef6b1b7998198bc92ac9890f6fe42c86a45f.zip
2 files changed, 15 insertions, 10 deletions
diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go
index 34e79a3..4a33473 100644
--- a/skate/cmd/skate-conv/main.go
+++ b/skate/cmd/skate-conv/main.go
@@ -36,7 +36,7 @@ func main() {
 	case "ol":
 		f = openLibraryToRelease
 	default:
-		log.Fatal("unsupported input schema: %v", *fromFormat)
+		log.Fatalf("unsupported input schema: %v", *fromFormat)
 	}
 	pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
 	pp.NumWorkers = *numWorkers
diff --git a/skate/verify.go b/skate/verify.go
index d1f98f0..5367ffe 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -134,9 +134,9 @@ func JsonMarshalNewline(v interface{}) ([]byte, error) {
 	return b, nil
 }
 
-// ClusterVerifyMaxSize runs verification across all pairs in the cluster. This is a
+// ClusterVerifyMaxClusterSize runs verification across all pairs in the cluster. This is a
 // port of https://git.io/JYgOB from fuzzycat.
-func ClusterVerifyMaxSize(p []byte, maxClusterSize int) ([]byte, error) {
+func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) {
 	var (
 		rc  *ReleaseCluster
 		buf bytes.Buffer
@@ -172,7 +172,7 @@ func ClusterVerifyMaxSize(p []byte, maxClusterSize int) ([]byte, error) {
 // ClusterVerify runs verification process across all pairs, but skips clusters
 // containing more than ten elements.
 func ClusterVerify(p []byte) ([]byte, error) {
-	return ClusterVerifyMaxSize(p, 10)
+	return ClusterVerifyMaxClusterSize(p, 10)
 }
 
 // RefClusterVerify deserializes a cluster document containing both converted
@@ -243,8 +243,7 @@ func RefClusterToBiblioRef(p []byte) ([]byte, error) {
 }
 
 // generateBiblioRef generates a bibliographic schema document.
-func generateBiblioRef(source, target *Release,
-	matchResult MatchResult, provenance string) *BiblioRef {
+func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef {
 	var bref BiblioRef
 	bref.SourceReleaseIdent = source.Ident
 	bref.SourceWorkIdent = source.WorkID
@@ -736,7 +735,8 @@ func parsePageString(s string) *ParsedPages {
 
 // averageScore take a limited set of authors and calculates pairwise
 // similarity scores, then returns the average of the best scores; between 0
-// and 1.
+// and 1. XXX: This should be revisited and factored out; for background
+// reading, see https://github.com/djudd/human-name.
 func averageScore(a, b set.Set) float64 {
 	aTrimmed := a.TopK(5)
 	bTrimmed := b.TopK(5)
@@ -763,8 +763,8 @@ func authorSimilarityScore(s, t string) float64 {
 	return ss.Jaccard(ts)
 }
 
-// tokenNgrams are groups of n tokens per token in string, e.g. for n=2 and
-// string "Anne K Lam", we would get ["an", "ne", "k", "la", "m"].
+// tokenNgrams are groups of n char-tokens per word-token in string, e.g. for
+// n=2 and string "Anne K Lam", we would get ["an", "ne", "k", "la", "m"].
 func tokenNgrams(s string, n int) (result []string) {
 	var buf bytes.Buffer
 	for _, token := range tokenizeString(s) {
@@ -790,7 +790,12 @@ func doiPrefix(s string) string {
 	return parts[0]
 }
 
-// unifyDigits replaces all digit groups with a placeholder, e.g. "<NUM>".
+// unifyDigits replaces all digit groups with a hopefully rare placeholder,
+// e.g. "<NUM>". This is for discovering very similar, yet different
+// publications, where e.g. titles differ only by a single char representing a
+// year. Examples are yearly publications, e.g. "World Health Report 2020",
+// where any plain similarity score would yield a high number, yet publications
+// are obviously not the same.
 func unifyDigits(s string) string {
 	return PatDigits.ReplaceAllString(s, "<NUM>")
 }
author	Martin Czygan <martin.czygan@gmail.com>	2021-05-05 16:26:29 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-05-05 16:26:29 +0200
commit	0e26ef6b1b7998198bc92ac9890f6fe42c86a45f (patch)
tree	6fa1dd861199f8d650f81034317fea4194a23778 /skate
parent	af1283dfcc2f56d4ac53dda619583f93fe6dfe87 (diff)
download	refcat-0e26ef6b1b7998198bc92ac9890f6fe42c86a45f.tar.gz refcat-0e26ef6b1b7998198bc92ac9890f6fe42c86a45f.zip