diff options
Diffstat (limited to 'skate')
-rw-r--r-- | skate/schema.go | 31 | ||||
-rw-r--r-- | skate/schema_test.go | 43 | ||||
-rw-r--r-- | skate/zippy.go | 12 |
3 files changed, 78 insertions, 8 deletions
diff --git a/skate/schema.go b/skate/schema.go index 10504fe..6b335a3 100644 --- a/skate/schema.go +++ b/skate/schema.go @@ -1,6 +1,9 @@ package skate import ( + "bytes" + "crypto/sha1" + "encoding/gob" "fmt" "regexp" "strconv" @@ -350,6 +353,34 @@ type BiblioRef struct { // } `json:"extra,omitempty"` } +// LinkHash returns a string that will be the same, if source and target are +// equal; different otherwise. +func (b *BiblioRef) LinkHash() string { + switch { + case b.SourceReleaseIdent != "" && b.TargetReleaseIdent != "": + return fmt.Sprintf("fc:%s--fc:%s", b.SourceReleaseIdent, b.TargetReleaseIdent) + case b.SourceReleaseIdent != "" && b.TargetOpenLibraryWork != "": + return fmt.Sprintf("fc:%s--ol:%s", b.SourceReleaseIdent, b.TargetOpenLibraryWork) + case b.SourceReleaseIdent != "" && b.TargetURL != "": + return fmt.Sprintf("fc:%s--wb:%s", b.SourceReleaseIdent, b.TargetURL) + case b.SourceReleaseIdent != "" && b.TargetURLSurt != "": + return fmt.Sprintf("fc:%s--wb:%s", b.SourceReleaseIdent, b.TargetURLSurt) + default: + var ( + buf bytes.Buffer + enc = gob.NewEncoder(&buf) + h = sha1.New() + ) + if err := enc.Encode(b); err != nil { + return "" + } + if _, err := buf.WriteTo(h); err != nil { + return "" + } + return fmt.Sprintf("%x", h.Sum(nil)) + } +} + // ReleaseCluster, a list of match candidates. This is typically serialized as a // single JSON line containing the match key and a list of release documents. 
// diff --git a/skate/schema_test.go b/skate/schema_test.go index 6cf4559..1301184 100644 --- a/skate/schema_test.go +++ b/skate/schema_test.go @@ -213,6 +213,49 @@ func TestParseIsbn(t *testing.T) { } } +func TestLinkHash(t *testing.T) { + var cases = []struct { + bref BiblioRef + linkHash string + }{ + { + bref: BiblioRef{}, + linkHash: "e39b4dd927dbfb19c8aa78b10f684b0a5c35fcca", + }, + { + bref: BiblioRef{SourceReleaseIdent: "123"}, + linkHash: "6d244189489b57c8f70ac7a835f1bacfee00c42a", + }, + { + bref: BiblioRef{ + SourceReleaseIdent: "123", + TargetReleaseIdent: "456", + }, + linkHash: "fc:123--fc:456", + }, + { + bref: BiblioRef{ + SourceReleaseIdent: "123", + TargetOpenLibraryWork: "/works/OL456M", + }, + linkHash: "fc:123--ol:/works/OL456M", + }, + { + bref: BiblioRef{ + SourceReleaseIdent: "123", + TargetURL: "http://fatcat.wiki", + }, + linkHash: "fc:123--wb:http://fatcat.wiki", + }, + } + for _, c := range cases { + result := c.bref.LinkHash() + if result != c.linkHash { + t.Fatalf("got %v, want %v", result, c.linkHash) + } + } +} + func BenchmarkParseIsbn(b *testing.B) { for n := 0; n < b.N; n++ { ParseIsbn("House Pvt. Limited., (2006), ISBN 9788183561426. Date accessed: August 2015.") diff --git a/skate/zippy.go b/skate/zippy.go index a6c5083..d71d57e 100644 --- a/skate/zippy.go +++ b/skate/zippy.go @@ -411,20 +411,16 @@ func uniqueMatches(docs []string, stats *statsAugment) (result []*BiblioRef, err sort.Slice(brefs, func(i, j int) bool { return brefs[i].MatchStatus == StatusExact.Short() }) - // We consider a match unique, if source and target match. 
- hash := func(bref *BiblioRef) string { - return bref.SourceReleaseIdent + "-" + bref.TargetReleaseIdent - } seen := set.New() for _, doc := range brefs { - v := hash(doc) - if seen.Contains(v) { + h := doc.LinkHash() + if seen.Contains(h) { stats.skipDuplicatedBref++ log.Printf("skip-dup-bref [%d]: hash=%v source=%v status=%v reason=%v", - stats.skipDuplicatedBref, v, doc.SourceReleaseIdent, doc.MatchStatus, doc.MatchReason) + stats.skipDuplicatedBref, h, doc.SourceReleaseIdent, doc.MatchStatus, doc.MatchReason) continue } - seen.Add(v) + seen.Add(h) result = append(result, doc) } return result, nil |