diff options
Diffstat (limited to 'skate/reduce.go')
-rw-r--r-- | skate/reduce.go | 60 |
1 files changed, 59 insertions, 1 deletions
diff --git a/skate/reduce.go b/skate/reduce.go index df96076..8658ffe 100644 --- a/skate/reduce.go +++ b/skate/reduce.go @@ -359,6 +359,59 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { return zipper.Run() } +// ZippyWayback takes a (url, ref) reader and a (url, cdx) reader and will +// write a bref document for each match. +func ZippyWayback(refs, cdx io.Reader, w io.Writer) error { + var ( + enc = json.NewEncoder(xio.NewSingleWriter(w)) + keyer = makeKeyFunc("\t", 1) + grouper = func(g *zipkey.Group) error { + var ( + ref *Ref + cdx *cdxSummary + err error + ) + // We take a single item from refs. + if ref, err = parseRef(Cut(g.G0[0], 2)); err != nil { + return err + } + if cdx, err = parseCdxSummary(Cut(g.G1[0], 2)); err != nil { + return err + } + var bref BiblioRef + bref.SourceReleaseIdent = ref.ReleaseIdent + bref.SourceWorkIdent = ref.WorkIdent + bref.SourceReleaseStage = ref.ReleaseStage + bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear) + bref.RefIndex = ref.Index + 1 // we want 1-index (also helps with omitempty) + bref.RefKey = ref.Key + if cdx.NumRows == 0 { + bref.TargetURL = cdx.Line + } else { + if cdx.Summary.Ok == "" { + bref.TargetURL = cdx.Line + } else { + // TODO: This would be better, if we only add a wayback + // link, if live web fails. For that we would need a full + // check of the URLs on the live web. + bref.TargetURL = fmt.Sprintf("https://web.archive.org/web/%s/%s", + cdx.Summary.Ok, cdx.Line) + } + } + bref.MatchStatus = StatusExact.Short() + bref.MatchReason = ReasonURLMatch.Short() + if err := enc.Encode(bref); err != nil { + return err + } + return nil + } + batcher = zipkey.NewBatcher(grouper) + ) + defer batcher.Close() + zipper := zipkey.New(refs, cdx, keyer, batcher.GroupFunc) + return zipper.Run() +} + // ZippyBrefAugment takes all matched docs from bref and adds docs from raw // refs, which have not been matched. It also gets rid of duplicate matches. // Note: This operates on two streams: raw refs with about 2.5B (07/2021) and @@ -661,7 +714,12 @@ func parseWiki(s string) (r *MinimalCitations, err error) { return } -func parseBiblioref(s string) (r *BiblioRef, err error) { +func parseBiblioRef(s string) (r *BiblioRef, err error) { + err = json.Unmarshal([]byte(s), &r) + return +} + +func parseCdxSummary(s string) (r *cdxSummary, err error) { err = json.Unmarshal([]byte(s), &r) return } |