From 131595eb60457115dbe9d3bc7c9dedb4253a17ed Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Sat, 22 May 2021 00:40:36 +0200 Subject: wip: ol fuzzy matching --- skate/cmd/skate-reduce/main.go | 20 +++++++++++++++--- skate/zippy.go | 47 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 3 deletions(-) (limited to 'skate') diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go index ca87994..19e5fb5 100644 --- a/skate/cmd/skate-reduce/main.go +++ b/skate/cmd/skate-reduce/main.go @@ -34,6 +34,11 @@ // * wiki | zippy mode for releases and wikipedia inputs. // | // | $ skate-reduce -m wiki -L a.ndj -W b.ndj +// | +// | +// * ol | zippy mode for refs and Open Library (OL) inputs. +// | +// | $ skate-reduce -m ol -F a.ndj -O b.ndj // package main @@ -58,9 +63,10 @@ var ( logFile = flag.String("log", "", "log filename") // Possible inputs -- we could switch to a subcommand cli parser? - refs = flag.String("F", "", "path to refs input") - releases = flag.String("L", "", "path to release input") - wiki = flag.String("W", "", "path to wiki input") + refs = flag.String("F", "", "path to refs input") + releases = flag.String("L", "", "path to release input") + wiki = flag.String("W", "", "path to wiki input") + openLibrary = flag.String("O", "", "path to open library input") // Extra args. 
reason = flag.String("r", "", "reason for match: doi, pmid, pmcid, arxiv, unknown") @@ -131,6 +137,14 @@ func main() { if err := skate.ZippyExactWiki(l, w, reasonMap["doi"], bw); err != nil { log.Fatal(err) } + case "ol": + o, f, err := xio.OpenTwo(*openLibrary, *refs) + if err != nil { + log.Fatal(err) + } + if err := skate.ZippyRefsOpenLibrary(o, f, bw); err != nil { + log.Fatal(err) + } default: log.Fatalf("invalid mode") } diff --git a/skate/zippy.go b/skate/zippy.go index f2dca98..194b8ba 100644 --- a/skate/zippy.go +++ b/skate/zippy.go @@ -159,6 +159,53 @@ func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error { return zipper.Run() } +// ZippyRefsOpenLibrary takes an Open Library and a refs reader (tsv, with ident, key, doc) +// and will execute the grouper defined below for each group of lines sharing a key. +func ZippyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { + // Define a grouper, working on one set of Open Library records and refs with the same + // key at a time. Here, we do verification and write out the generated + // biblioref. + var ( + enc = json.NewEncoder(w) + keyer = makeKeyFunc("\t", 1) + grouper = func(g *zipkey.Group) error { + var ( + re, pivot *Release + err error + ) + if len(g.G0) == 0 || len(g.G1) == 0 { + return nil + } + if pivot, err = stringToRelease(cut(g.G0[0], "\t", 2)); err != nil { + return err + } + for _, line := range g.G1 { + if re, err = stringToRelease(cut(line, "\t", 2)); err != nil { + return err + } + result := Verify(pivot, re) + switch result.Status { + case StatusExact, StatusStrong: + if result.Reason == ReasonDOI { + continue + } + br := generateBiblioRef(re, pivot, result, "fuzzy") + if err := enc.Encode(br); err != nil { + return err + } + default: + // XXX: We want to add unmatched pieces as well; here? We + // probably want to do a single final pass to complete the + // dataset. 
+ } + } + return nil + } + ) + zipper := zipkey.New(olr, refs, keyer, grouper) + return zipper.Run() +} + // makeKeyFunc creates a function that can be used as keyFunc, selecting a // column from fields separated by sep; column is 1-indexed. func makeKeyFunc(sep string, column int) func(string) (string, error) { -- cgit v1.2.3