aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-05-05 16:26:29 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-05-05 16:26:29 +0200
commit 0e26ef6b1b7998198bc92ac9890f6fe42c86a45f (patch)
tree 6fa1dd861199f8d650f81034317fea4194a23778 /skate
parent af1283dfcc2f56d4ac53dda619583f93fe6dfe87 (diff)
download refcat-0e26ef6b1b7998198bc92ac9890f6fe42c86a45f.tar.gz
refcat-0e26ef6b1b7998198bc92ac9890f6fe42c86a45f.zip
update docs
Diffstat (limited to 'skate')
-rw-r--r-- skate/cmd/skate-conv/main.go 2
-rw-r--r-- skate/verify.go 23
2 files changed, 15 insertions, 10 deletions
diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go
index 34e79a3..4a33473 100644
--- a/skate/cmd/skate-conv/main.go
+++ b/skate/cmd/skate-conv/main.go
@@ -36,7 +36,7 @@ func main() {
case "ol":
f = openLibraryToRelease
default:
- log.Fatal("unsupported input schema: %v", *fromFormat)
+ log.Fatalf("unsupported input schema: %v", *fromFormat)
}
pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
pp.NumWorkers = *numWorkers
diff --git a/skate/verify.go b/skate/verify.go
index d1f98f0..5367ffe 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -134,9 +134,9 @@ func JsonMarshalNewline(v interface{}) ([]byte, error) {
return b, nil
}
-// ClusterVerifyMaxSize runs verification across all pairs in the cluster. This is a
+// ClusterVerifyMaxClusterSize runs verification across all pairs in the cluster. This is a
// port of https://git.io/JYgOB from fuzzycat.
-func ClusterVerifyMaxSize(p []byte, maxClusterSize int) ([]byte, error) {
+func ClusterVerifyMaxClusterSize(p []byte, maxClusterSize int) ([]byte, error) {
var (
rc *ReleaseCluster
buf bytes.Buffer
@@ -172,7 +172,7 @@ func ClusterVerifyMaxSize(p []byte, maxClusterSize int) ([]byte, error) {
// ClusterVerify runs verification process across all pairs, but skips clusters
// containing more than ten elements.
func ClusterVerify(p []byte) ([]byte, error) {
- return ClusterVerifyMaxSize(p, 10)
+ return ClusterVerifyMaxClusterSize(p, 10)
}
// RefClusterVerify deserializes a cluster document containing both converted
@@ -243,8 +243,7 @@ func RefClusterToBiblioRef(p []byte) ([]byte, error) {
}
// generateBiblioRef generates a bibliographic schema document.
-func generateBiblioRef(source, target *Release,
- matchResult MatchResult, provenance string) *BiblioRef {
+func generateBiblioRef(source, target *Release, matchResult MatchResult, provenance string) *BiblioRef {
var bref BiblioRef
bref.SourceReleaseIdent = source.Ident
bref.SourceWorkIdent = source.WorkID
@@ -736,7 +735,8 @@ func parsePageString(s string) *ParsedPages {
// averageScore takes a limited set of authors and calculates pairwise
// similarity scores, then returns the average of the best scores; between 0
-// and 1.
+// and 1. XXX: This should be revisited and factored out; reading:
+// https://github.com/djudd/human-name.
func averageScore(a, b set.Set) float64 {
aTrimmed := a.TopK(5)
bTrimmed := b.TopK(5)
@@ -763,8 +763,8 @@ func authorSimilarityScore(s, t string) float64 {
return ss.Jaccard(ts)
}
-// tokenNgrams are groups of n tokens per token in string, e.g. for n=2 and
-// string "Anne K Lam", we would get ["an", "ne", "k", "la", "m"].
+// tokenNgrams are groups of n char-tokens per word-token in string, e.g. for
+// n=2 and string "Anne K Lam", we would get ["an", "ne", "k", "la", "m"].
func tokenNgrams(s string, n int) (result []string) {
var buf bytes.Buffer
for _, token := range tokenizeString(s) {
@@ -790,7 +790,12 @@ func doiPrefix(s string) string {
return parts[0]
}
-// unifyDigits replaces all digit groups with a placeholder, e.g. "<NUM>".
+// unifyDigits replaces all digit groups with a hopefully rare placeholder,
+// e.g. "<NUM>". This is for discovering very similar, yet different
+// publications, where e.g. titles differ only by a single character in the
+// year. Examples are yearly publications, e.g. "World Health Report 2020",
+// where any plain similarity score would yield a high number, yet the
+// publications are obviously not the same.
func unifyDigits(s string) string {
	// Every match of PatDigits (declared elsewhere in this package) is
	// substituted by the same fixed token, so inputs that differ only in their
	// digit groups map to the same output.
	const placeholder = "<NUM>"
	return PatDigits.ReplaceAllString(s, placeholder)
}