aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-07-20 11:07:45 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-07-20 11:09:16 +0200
commit 16898dedf28a3db7ecebf76f377a3d0d8cd45c7a (patch)
tree 0c642dd1ba66cd72af32cdcbe3df058005d38b15 /skate
parent b887b7743d8ef2edc2e5daef67ef96c10596a5c8 (diff)
download refcat-16898dedf28a3db7ecebf76f377a3d0d8cd45c7a.tar.gz
refcat-16898dedf28a3db7ecebf76f377a3d0d8cd45c7a.zip
reduce: use fixed length sha1 for url id part
base32 would occasionally exceed elasticsearch id field limit ("must be no longer than 512 bytes but was: 649")
Diffstat (limited to 'skate')
-rw-r--r--skate/reduce.go8
1 files changed, 5 insertions, 3 deletions
diff --git a/skate/reduce.go b/skate/reduce.go
index e8282d8..c96d17e 100644
--- a/skate/reduce.go
+++ b/skate/reduce.go
@@ -20,6 +20,7 @@
package skate
import (
+ "crypto/sha1"
"encoding/base32"
"fmt"
"io"
@@ -364,13 +365,13 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
func ZippyWayback(refs, cdx io.Reader, w io.Writer) error {
var (
enc = json.NewEncoder(xio.NewSingleWriter(w))
- b32enc = base32.StdEncoding.WithPadding(base32.NoPadding)
keyer = makeKeyFunc("\t", 1)
grouper = func(g *zipkey.Group) error {
var (
ref *Ref
cdx *cdxSummary
err error
+ h = sha1.New()
)
// We take a single item from refs.
if ref, err = parseRef(Cut(g.G0[0], 2)); err != nil {
@@ -381,8 +382,9 @@ func ZippyWayback(refs, cdx io.Reader, w io.Writer) error {
}
var bref BiblioRef
// TODO: this is a temporary way to generate an id.
- encodedURL := strings.ToLower(b32enc.EncodeToString([]byte(cdx.Line)))
- bref.Key = fmt.Sprintf("web_%s_%s", ref.ReleaseIdent, encodedURL)
+ _, _ = h.Write([]byte(cdx.Line))
+ hashedURL := fmt.Sprintf("%x", h.Sum(nil))
+ bref.Key = fmt.Sprintf("web_%s_%s", ref.ReleaseIdent, hashedURL)
bref.IndexedTs = timeNow().UTC().Format(time.RFC3339)
bref.SourceReleaseIdent = ref.ReleaseIdent
bref.SourceWorkIdent = ref.WorkIdent