diff options
-rw-r--r-- | skate/zippy.go | 53 |
1 files changed, 27 insertions, 26 deletions
diff --git a/skate/zippy.go b/skate/zippy.go index e7677b9..b69ce2b 100644 --- a/skate/zippy.go +++ b/skate/zippy.go @@ -54,12 +54,12 @@ func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer) if len(g.G0) == 0 || len(g.G1) == 0 { return nil } - if target, err = stringToRelease(Cut(g.G0[0], 2)); err != nil { + if target, err = parseRelease(Cut(g.G0[0], 2)); err != nil { groupLogf(g, "[skip] failed to parse release: %v", err) return nil } for _, line := range g.G1 { - if ref, err = stringToRef(Cut(line, 2)); err != nil { + if ref, err = parseRef(Cut(line, 2)); err != nil { groupLogf(g, "[skip] failed to parse ref: %v", err) continue } @@ -87,8 +87,8 @@ func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer) } // ZippyExactReleases takes two release readers (key, doc) and assigns a fixed -// match result. -func ZippyExactReleases(olReader, reReader io.Reader, matchResult MatchResult, w io.Writer) error { +// match result, e.g. used with release entities converted from open library snapshots. +func ZippyExactReleases(olr, releases io.Reader, matchResult MatchResult, w io.Writer) error { var ( enc = json.NewEncoder(w) keyer = makeKeyFunc("\t", 1) @@ -105,12 +105,12 @@ func ZippyExactReleases(olReader, reReader io.Reader, matchResult MatchResult, w if len(g.G0) == 0 || len(g.G1) == 0 { return nil } - if target, err = stringToRelease(Cut(g.G0[0], 2)); err != nil { + if target, err = parseRelease(Cut(g.G0[0], 2)); err != nil { groupLogf(g, "[skip] failed to parse release: %v", err) return nil } for _, line := range g.G1 { - if re, err = stringToRelease(Cut(line, 2)); err != nil { + if re, err = parseRelease(Cut(line, 2)); err != nil { groupLogf(g, "[skip] failed to parse release: %v", err) continue } @@ -135,7 +135,7 @@ func ZippyExactReleases(olReader, reReader io.Reader, matchResult MatchResult, w return nil } ) - zipper := zipkey.New(olReader, reReader, keyer, grouper) + zipper := zipkey.New(olr, releases, keyer, grouper) return zipper.Run() } @@ -154,11 +154,11 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error if len(g.G0) == 0 || len(g.G1) == 0 { return nil } - if target, err = stringToRelease(Cut(g.G0[0], 2)); err != nil { + if target, err = parseRelease(Cut(g.G0[0], 2)); err != nil { return err } for _, line := range g.G1 { - if wiki, err = stringToWiki(Cut(line, 2)); err != nil { + if wiki, err = parseWiki(Cut(line, 2)); err != nil { return err } var bref BiblioRef @@ -180,8 +180,9 @@ func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error return zipper.Run() } -// ZippyVerifyRefs takes a release and refs reader (key, doc), run fuzzy -// verification and will emit a biblioref document, if exact or strong match. +// ZippyVerifyRefs takes a release and refs (as release) reader (key, doc), run +// fuzzy verification and will emit a biblioref document, if exact or strong +// match. func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error { var ( enc = json.NewEncoder(w) @@ -194,11 +195,11 @@ func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error { if len(g.G0) == 0 || len(g.G1) == 0 { return nil } - if pivot, err = stringToRelease(Cut(g.G0[0], 2)); err != nil { + if pivot, err = parseRelease(Cut(g.G0[0], 2)); err != nil { return err } for _, line := range g.G1 { - if re, err = stringToRelease(Cut(line, 2)); err != nil { + if re, err = parseRelease(Cut(line, 2)); err != nil { return err } result := Verify(pivot, re) @@ -212,9 +213,6 @@ func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error { return err } default: - // XXX: We want to add unmatched pieces as well; here? We - // probably want to do a single final pass to complete the - // dataset. } } return nil @@ -238,11 +236,11 @@ func ZippyVerifyRefsOpenLibraryTable(olr, refs io.Reader, w io.Writer) error { return nil } // We take a single edition from OL. - if pivot, err = stringToRelease(Cut(g.G0[0], 2)); err != nil { + if pivot, err = parseRelease(Cut(g.G0[0], 2)); err != nil { return err } for _, line := range g.G1 { - if re, err = stringToRelease(Cut(line, 2)); err != nil { + if re, err = parseRelease(Cut(line, 2)); err != nil { return err } // The refs have a container name, but not a title, but here we @@ -267,7 +265,7 @@ func ZippyVerifyRefsOpenLibraryTable(olr, refs io.Reader, w io.Writer) error { } // ZippyVerifyRefsOpenLibrary takes OL editions (as release) and refs (as -// release) and emits a match table for manual inspection. +// release) and writes biblioref. func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { var ( enc = json.NewEncoder(w) @@ -281,11 +279,11 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { return nil } // We take a single edition from OL. - if pivot, err = stringToRelease(Cut(g.G0[0], 2)); err != nil { + if pivot, err = parseRelease(Cut(g.G0[0], 2)); err != nil { return err } for _, line := range g.G1 { - if ref, err = stringToRelease(Cut(line, 2)); err != nil { + if ref, err = parseRelease(Cut(line, 2)); err != nil { return err } // The refs have a container name, but not a title, but here we @@ -320,11 +318,14 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { // ZippyBrefAugment takes all matched docs from bref and adds docs from raw // refs, which have not been matched. It also gets rid of duplicate matches. +// Note: This operates on two streams: raw refs with about 2.5B (07/2021) and +// matches, which will be about 1B; in essence we have to iterate through about +// 3.5B records; small tweak here may be worthwhile. // // We can identify, which docs have been matched by checking the source ident, // ref index and key. // -// TODO: This needs to be completed. +// TODO: This needs to be completed and made fast. func ZippyBrefAugment(bref, raw io.Reader, w io.Writer) error { var ( stats = statsAugment{} @@ -557,22 +558,22 @@ func makeKeyFunc(sep string, column int) func(string) (string, error) { } } -func stringToRelease(s string) (r *Release, err error) { +func parseRelease(s string) (r *Release, err error) { err = json.Unmarshal([]byte(s), &r) return } -func stringToRef(s string) (r *Ref, err error) { +func parseRef(s string) (r *Ref, err error) { err = json.Unmarshal([]byte(s), &r) return } -func stringToWiki(s string) (r *MinimalCitations, err error) { +func parseWiki(s string) (r *MinimalCitations, err error) { err = json.Unmarshal([]byte(s), &r) return } -func stringToBiblioref(s string) (r *BiblioRef, err error) { +func parseBiblioref(s string) (r *BiblioRef, err error) { err = json.Unmarshal([]byte(s), &r) return } |