From 738c7e99035b0548f6a8ea3082f03eeb1e36dc98 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Sat, 10 Jul 2021 01:46:59 +0200 Subject: reduce: open library id tweaks --- skate/reduce.go | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) (limited to 'skate') diff --git a/skate/reduce.go b/skate/reduce.go index 823255c..2a8ac1c 100644 --- a/skate/reduce.go +++ b/skate/reduce.go @@ -13,7 +13,7 @@ // TODO: // * [ ] pass release stage through all match types // * [ ] switch to faster logging, e.g. zerolog, https://github.com/rs/zerolog#benchmarks -// * [ ] batch, parallelize +// * [x] batch, parallelize // * [ ] unify flags to "-a", "-b" package skate @@ -225,6 +225,7 @@ func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error { if result.Reason == ReasonDOI { continue } + // XXX: what should be the provenance? br := generateBiblioRef(re, pivot, result, "fuzzy") if err := enc.Encode(br); err != nil { return err @@ -242,7 +243,8 @@ func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error { } // ZippyVerifyRefsOpenLibraryTable takes OL editions (as release) and refs (as -// release) and emits a match table for manual inspection. +// release) and emits a match table for manual inspection. This is mainly for +// debugging. func ZippyVerifyRefsOpenLibraryTable(olr, refs io.Reader, w io.Writer) error { var ( keyer = makeKeyFunc("\t", 1) @@ -287,8 +289,24 @@ func ZippyVerifyRefsOpenLibraryTable(olr, refs io.Reader, w io.Writer) error { // release) and writes biblioref. func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { var ( - enc = json.NewEncoder(xio.NewSingleWriter(w)) - keyer = makeKeyFunc("\t", 1) + enc = json.NewEncoder(xio.NewSingleWriter(w)) + keyer = makeKeyFunc("\t", 1) + cleanIdentifier = func(s string) string { + // Turn ids like /books/OL31189321M into OL31189321M + s = strings.TrimSpace(s) + if len(s) == 0 { + return "" + } + var ( + parts = strings.Split(s, "/") + last = parts[len(parts)-1] + ) + if strings.HasPrefix(last, "OL") { + return last + } + log.Printf("warning: unexpected OL id: %s", s) + return "" + } grouper = func(g *zipkey.Group) error { var ( ref, pivot *Release // ref (reference), pivot (open library) @@ -311,6 +329,10 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { result := Verify(pivot, ref) switch result.Status { case StatusExact, StatusStrong: + openLibraryWorkID := cleanIdentifier(pivot.WorkID) + if openLibraryWorkID == "" { + continue + } var bref BiblioRef bref.SourceReleaseIdent = ref.Ident bref.SourceWorkIdent = ref.WorkID @@ -318,7 +340,7 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear()) bref.RefIndex = ref.Extra.Skate.Ref.Index + 1 // we want 1-index (also helps with omitempty) bref.RefKey = ref.Extra.Skate.Ref.Key - bref.TargetOpenLibraryWork = pivot.WorkID + bref.TargetOpenLibraryWork = openLibraryWorkID bref.MatchProvenance = ref.Extra.Skate.Ref.Source bref.MatchStatus = result.Status.Short() bref.MatchReason = result.Reason.Short() -- cgit v1.2.3