diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-07-20 11:07:45 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-07-20 11:09:16 +0200 |
commit | 16898dedf28a3db7ecebf76f377a3d0d8cd45c7a (patch) | |
tree | 0c642dd1ba66cd72af32cdcbe3df058005d38b15 | |
parent | b887b7743d8ef2edc2e5daef67ef96c10596a5c8 (diff) | |
download | refcat-16898dedf28a3db7ecebf76f377a3d0d8cd45c7a.tar.gz refcat-16898dedf28a3db7ecebf76f377a3d0d8cd45c7a.zip |
reduce: use fixed length sha1 for url id part
base32 would occasionally exceed elasticsearch id field limit ("must be
no longer than 512 bytes but was: 649")
-rw-r--r-- | skate/reduce.go | 8 |
1 files changed, 5 insertions, 3 deletions
diff --git a/skate/reduce.go b/skate/reduce.go index e8282d8..c96d17e 100644 --- a/skate/reduce.go +++ b/skate/reduce.go @@ -20,6 +20,7 @@ package skate import ( + "crypto/sha1" "encoding/base32" "fmt" "io" @@ -364,13 +365,13 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { func ZippyWayback(refs, cdx io.Reader, w io.Writer) error { var ( enc = json.NewEncoder(xio.NewSingleWriter(w)) - b32enc = base32.StdEncoding.WithPadding(base32.NoPadding) keyer = makeKeyFunc("\t", 1) grouper = func(g *zipkey.Group) error { var ( ref *Ref cdx *cdxSummary err error + h = sha1.New() ) // We take a single item from refs. if ref, err = parseRef(Cut(g.G0[0], 2)); err != nil { @@ -381,8 +382,9 @@ func ZippyWayback(refs, cdx io.Reader, w io.Writer) error { } var bref BiblioRef // TODO: this is a temporary way to generate an id. - encodedURL := strings.ToLower(b32enc.EncodeToString([]byte(cdx.Line))) - bref.Key = fmt.Sprintf("web_%s_%s", ref.ReleaseIdent, encodedURL) + _, _ = h.Write([]byte(cdx.Line)) + hashedURL := fmt.Sprintf("%x", h.Sum(nil)) + bref.Key = fmt.Sprintf("web_%s_%s", ref.ReleaseIdent, hashedURL) bref.IndexedTs = timeNow().UTC().Format(time.RFC3339) bref.SourceReleaseIdent = ref.ReleaseIdent bref.SourceWorkIdent = ref.WorkIdent |