aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-06-15 15:55:34 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-06-15 15:55:34 +0200
commit 97a1a7b547ae3d0f6611ec1c5159a5447759e6dc (patch)
tree 27560f1ef53bf9eea0743b61e5d0ce0fc5a3c4d6 /skate
parent 63922f9e6a3d1ab0be13d3ae2fd4eb9c55069611 (diff)
download refcat-97a1a7b547ae3d0f6611ec1c5159a5447759e6dc.tar.gz
refcat-97a1a7b547ae3d0f6611ec1c5159a5447759e6dc.zip
zippy: unique link id for different link types
Diffstat (limited to 'skate')
-rw-r--r-- skate/schema.go 31
-rw-r--r-- skate/schema_test.go 43
-rw-r--r-- skate/zippy.go 12
3 files changed, 78 insertions, 8 deletions
diff --git a/skate/schema.go b/skate/schema.go
index 10504fe..6b335a3 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -1,6 +1,9 @@
package skate
import (
+ "bytes"
+ "crypto/sha1"
+ "encoding/gob"
"fmt"
"regexp"
"strconv"
@@ -350,6 +353,34 @@ type BiblioRef struct {
// } `json:"extra,omitempty"`
}
+// LinkHash returns a string key for the link described by b; references
+// with the same source and target yield the same key.
+//
+// If the source release ident and one target field are set, the key has the
+// form "fc:<source>--<prefix>:<target>". The cases are tried in order: a
+// target release ident ("fc") wins over an Open Library work ("ol"), which
+// wins over a target URL and then a target URL SURT (both use the "wb"
+// prefix). In every other case the key is the hex encoded SHA1 digest of the
+// gob encoding of b. An empty string is returned if encoding b or writing
+// the encoded bytes to the hash fails.
+func (b *BiblioRef) LinkHash() string {
+ switch {
+ case b.SourceReleaseIdent != "" && b.TargetReleaseIdent != "":
+ return fmt.Sprintf("fc:%s--fc:%s", b.SourceReleaseIdent, b.TargetReleaseIdent)
+ case b.SourceReleaseIdent != "" && b.TargetOpenLibraryWork != "":
+ return fmt.Sprintf("fc:%s--ol:%s", b.SourceReleaseIdent, b.TargetOpenLibraryWork)
+ case b.SourceReleaseIdent != "" && b.TargetURL != "":
+ return fmt.Sprintf("fc:%s--wb:%s", b.SourceReleaseIdent, b.TargetURL)
+ case b.SourceReleaseIdent != "" && b.TargetURLSurt != "":
+ return fmt.Sprintf("fc:%s--wb:%s", b.SourceReleaseIdent, b.TargetURLSurt)
+ default:
+ // No source ident together with a known target field: fall back to a
+ // digest computed over the gob encoding of the whole value.
+ var (
+ buf bytes.Buffer
+ enc = gob.NewEncoder(&buf)
+ h = sha1.New()
+ )
+ if err := enc.Encode(b); err != nil {
+ return ""
+ }
+ // Drain the encoded bytes into the hash.
+ if _, err := buf.WriteTo(h); err != nil {
+ return ""
+ }
+ return fmt.Sprintf("%x", h.Sum(nil))
+ }
+}
+
// ReleaseCluster, a list of match candidates. This is typically serialized as a
// single JSON line containing the match key and a list of release documents.
//
diff --git a/skate/schema_test.go b/skate/schema_test.go
index 6cf4559..1301184 100644
--- a/skate/schema_test.go
+++ b/skate/schema_test.go
@@ -213,6 +213,49 @@ func TestParseIsbn(t *testing.T) {
}
}
+// TestLinkHash runs LinkHash over a table of BiblioRef values and compares
+// the result with the expected key. Values without a source ident and
+// target are expected to yield a hex digest, the others a prefixed
+// "fc:<source>--<prefix>:<target>" key.
+func TestLinkHash(t *testing.T) {
+ var cases = []struct {
+ bref BiblioRef
+ linkHash string
+ }{
+ // Zero value, neither source nor target set: digest expected.
+ {
+ bref: BiblioRef{},
+ linkHash: "e39b4dd927dbfb19c8aa78b10f684b0a5c35fcca",
+ },
+ // Source only, no target: digest expected.
+ {
+ bref: BiblioRef{SourceReleaseIdent: "123"},
+ linkHash: "6d244189489b57c8f70ac7a835f1bacfee00c42a",
+ },
+ // Source and target release ident.
+ {
+ bref: BiblioRef{
+ SourceReleaseIdent: "123",
+ TargetReleaseIdent: "456",
+ },
+ linkHash: "fc:123--fc:456",
+ },
+ // Source and Open Library work target.
+ {
+ bref: BiblioRef{
+ SourceReleaseIdent: "123",
+ TargetOpenLibraryWork: "/works/OL456M",
+ },
+ linkHash: "fc:123--ol:/works/OL456M",
+ },
+ // Source and URL target.
+ {
+ bref: BiblioRef{
+ SourceReleaseIdent: "123",
+ TargetURL: "http://fatcat.wiki",
+ },
+ linkHash: "fc:123--wb:http://fatcat.wiki",
+ },
+ }
+ for _, c := range cases {
+ result := c.bref.LinkHash()
+ if result != c.linkHash {
+ // Fatalf stops the test at the first mismatching case.
+ t.Fatalf("got %v, want %v", result, c.linkHash)
+ }
+ }
+}
+
func BenchmarkParseIsbn(b *testing.B) {
for n := 0; n < b.N; n++ {
ParseIsbn("House Pvt. Limited., (2006), ISBN 9788183561426. Date accessed: August 2015.")
diff --git a/skate/zippy.go b/skate/zippy.go
index a6c5083..d71d57e 100644
--- a/skate/zippy.go
+++ b/skate/zippy.go
@@ -411,20 +411,16 @@ func uniqueMatches(docs []string, stats *statsAugment) (result []*BiblioRef, err
sort.Slice(brefs, func(i, j int) bool {
return brefs[i].MatchStatus == StatusExact.Short()
})
- // We consider a match unique, if source and target match.
- hash := func(bref *BiblioRef) string {
- return bref.SourceReleaseIdent + "-" + bref.TargetReleaseIdent
- }
seen := set.New()
for _, doc := range brefs {
- v := hash(doc)
- if seen.Contains(v) {
+ h := doc.LinkHash()
+ if seen.Contains(h) {
stats.skipDuplicatedBref++
log.Printf("skip-dup-bref [%d]: hash=%v source=%v status=%v reason=%v",
- stats.skipDuplicatedBref, v, doc.SourceReleaseIdent, doc.MatchStatus, doc.MatchReason)
+ stats.skipDuplicatedBref, h, doc.SourceReleaseIdent, doc.MatchStatus, doc.MatchReason)
continue
}
- seen.Add(v)
+ seen.Add(h)
result = append(result, doc)
}
return result, nil