diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-07-10 01:46:59 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-07-10 01:46:59 +0200 |
commit | 738c7e99035b0548f6a8ea3082f03eeb1e36dc98 (patch) | |
tree | b2c79d08c0e6a6c3b4152d64566970f0fc09fc69 /skate | |
parent | 9095e6089b9f679c4d84be0613224b4edd02f0f4 (diff) | |
download | refcat-738c7e99035b0548f6a8ea3082f03eeb1e36dc98.tar.gz refcat-738c7e99035b0548f6a8ea3082f03eeb1e36dc98.zip |
reduce: open library id tweaks
Diffstat (limited to 'skate')
-rw-r--r-- | skate/reduce.go | 32 |
1 files changed, 27 insertions, 5 deletions
diff --git a/skate/reduce.go b/skate/reduce.go
index 823255c..2a8ac1c 100644
--- a/skate/reduce.go
+++ b/skate/reduce.go
@@ -13,7 +13,7 @@
 // TODO:
 // * [ ] pass release stage through all match types
 // * [ ] switch to faster logging, e.g. zerolog, https://github.com/rs/zerolog#benchmarks
-// * [ ] batch, parallelize
+// * [x] batch, parallelize
 // * [ ] unify flags to "-a", "-b"
 
 package skate
@@ -225,6 +225,7 @@ func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error {
 if result.Reason == ReasonDOI {
 continue
 }
+ // XXX: what should be the provenance?
 br := generateBiblioRef(re, pivot, result, "fuzzy")
 if err := enc.Encode(br); err != nil {
 return err
@@ -242,7 +243,8 @@ func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error {
 }
 
 // ZippyVerifyRefsOpenLibraryTable takes OL editions (as release) and refs (as
-// release) and emits a match table for manual inspection.
+// release) and emits a match table for manual inspection. This is mainly for
+// debugging.
 func ZippyVerifyRefsOpenLibraryTable(olr, refs io.Reader, w io.Writer) error {
 var (
 keyer = makeKeyFunc("\t", 1)
@@ -287,8 +289,24 @@ func ZippyVerifyRefsOpenLibraryTable(olr, refs io.Reader, w io.Writer) error {
 // release) and writes biblioref.
 func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
 var (
- enc = json.NewEncoder(xio.NewSingleWriter(w))
- keyer = makeKeyFunc("\t", 1)
+ enc = json.NewEncoder(xio.NewSingleWriter(w))
+ keyer = makeKeyFunc("\t", 1)
+ cleanIdentifier = func(s string) string {
+ // Turn ids like /books/OL31189321M into OL31189321M
+ s = strings.TrimSpace(s)
+ if len(s) == 0 {
+ return ""
+ }
+ var (
+ parts = strings.Split(s, "/")
+ last = parts[len(parts)-1]
+ )
+ if strings.HasPrefix(last, "OL") {
+ return last
+ }
+ log.Printf("warning: unexpected OL id: %s", s)
+ return ""
+ }
 grouper = func(g *zipkey.Group) error {
 var (
 ref, pivot *Release // ref (reference), pivot (open library)
@@ -311,6 +329,10 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
 result := Verify(pivot, ref)
 switch result.Status {
 case StatusExact, StatusStrong:
+ openLibraryWorkID := cleanIdentifier(pivot.WorkID)
+ if openLibraryWorkID == "" {
+ continue
+ }
 var bref BiblioRef
 bref.SourceReleaseIdent = ref.Ident
 bref.SourceWorkIdent = ref.WorkID
@@ -318,7 +340,7 @@ func ZippyVerifyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
 bref.SourceYear = fmt.Sprintf("%d", ref.ReleaseYear())
 bref.RefIndex = ref.Extra.Skate.Ref.Index + 1 // we want 1-index (also helps with omitempty)
 bref.RefKey = ref.Extra.Skate.Ref.Key
- bref.TargetOpenLibraryWork = pivot.WorkID
+ bref.TargetOpenLibraryWork = openLibraryWorkID
 bref.MatchProvenance = ref.Extra.Skate.Ref.Source
 bref.MatchStatus = result.Status.Short()
 bref.MatchReason = result.Reason.Short()