about | summary | refs | log | tree | commit | diff | stats
path: root/skate/reduce.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/reduce.go')
-rw-r--r-- skate/reduce.go 60
1 files changed, 59 insertions, 1 deletions
diff --git a/skate/reduce.go b/skate/reduce.go
index df96076..8658ffe 100644
--- a/skate/reduce.go
+++ b/skate/reduce.go
@@ -359,6 +359,59 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
return zipper.Run()
}
+// ZippyWayback takes a (url, ref) reader and a (url, cdx) reader and writes one
+// bref document per matched group, built from the first ref and first cdx item.
+func ZippyWayback(refs, cdx io.Reader, w io.Writer) error {
+ var (
+ enc = json.NewEncoder(xio.NewSingleWriter(w))
+ keyer = makeKeyFunc("\t", 1)
+ grouper = func(g *zipkey.Group) error {
+ var (
+ ref *Ref
+ cdx *cdxSummary // NOTE: shadows the cdx reader parameter inside this closure
+ err error
+ )
+ // Only the first item of each side is used; further items in the group are ignored.
+ if ref, err = parseRef(Cut(g.G0[0], 2)); err != nil {
+ return err
+ }
+ if cdx, err = parseCdxSummary(Cut(g.G1[0], 2)); err != nil {
+ return err
+ }
+ var bref BiblioRef
+ bref.SourceReleaseIdent = ref.ReleaseIdent
+ bref.SourceWorkIdent = ref.WorkIdent
+ bref.SourceReleaseStage = ref.ReleaseStage
+ bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear)
+ bref.RefIndex = ref.Index + 1 // we want a 1-based index (also helps with omitempty)
+ bref.RefKey = ref.Key
+ if cdx.NumRows == 0 { // no rows: use cdx.Line unchanged (presumably the plain URL; confirm)
+ bref.TargetURL = cdx.Line
+ } else {
+ if cdx.Summary.Ok == "" { // nothing to put into the wayback path; fall back to cdx.Line
+ bref.TargetURL = cdx.Line
+ } else {
+ // TODO: It would be better to add a wayback link only if the
+ // live web fails. For that we would need a full check of the
+ // URLs on the live web.
+ bref.TargetURL = fmt.Sprintf("https://web.archive.org/web/%s/%s",
+ cdx.Summary.Ok, cdx.Line)
+ }
+ }
+ bref.MatchStatus = StatusExact.Short()
+ bref.MatchReason = ReasonURLMatch.Short()
+ if err := enc.Encode(bref); err != nil {
+ return err
+ }
+ return nil
+ }
+ batcher = zipkey.NewBatcher(grouper)
+ )
+ defer batcher.Close()
+ zipper := zipkey.New(refs, cdx, keyer, batcher.GroupFunc) // cdx here is the reader parameter
+ return zipper.Run()
+}
+
// ZippyBrefAugment takes all matched docs from bref and adds docs from raw
// refs, which have not been matched. It also gets rid of duplicate matches.
// Note: This operates on two streams: raw refs with about 2.5B (07/2021) and
@@ -661,7 +714,12 @@ func parseWiki(s string) (r *MinimalCitations, err error) {
return
}
-func parseBiblioref(s string) (r *BiblioRef, err error) {
+func parseBiblioRef(s string) (r *BiblioRef, err error) {
+ err = json.Unmarshal([]byte(s), &r) // r is nil here; json.Unmarshal allocates the value it points to
+ return
+}
+
+func parseCdxSummary(s string) (r *cdxSummary, err error) { // decodes s as JSON into a newly allocated cdxSummary
err = json.Unmarshal([]byte(s), &r)
return
}