diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-22 00:40:36 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-22 00:40:36 +0200 |
commit | 131595eb60457115dbe9d3bc7c9dedb4253a17ed (patch) | |
tree | 3c9fade2168e36afac265895058d8e0d5d5fb46c | |
parent | d0413165c6259d8c69f04948ceb649eb40910d0b (diff) | |
download | refcat-131595eb60457115dbe9d3bc7c9dedb4253a17ed.tar.gz refcat-131595eb60457115dbe9d3bc7c9dedb4253a17ed.zip |
wip: ol fuzzy matching
-rw-r--r-- | python/refcat/tasks.py | 26 |
-rw-r--r-- | skate/cmd/skate-reduce/main.go | 20 |
-rw-r--r-- | skate/zippy.go | 47 |
3 files changed, 89 insertions, 4 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index d17a1f9..aaa0551 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -711,7 +711,7 @@ class OpenLibraryWorks(Refcat): class OpenLibraryMapped(Refcat): """ - Map OL data with e.g. a title normalizing mapper. + Map OL data with e.g. a title normalizing mapper. 4m8.341s. """ mapper = luigi.Parameter(default="ts", description="mapper short name") @@ -734,3 +734,27 @@ class OpenLibraryMapped(Refcat): def output(self): return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd) + +class BrefZipFuzzyOpenLibrary(Refcat): + """ + Fuzzy matching from refs to open library (both converted to release schema). + """ + mapper = luigi.Parameter(default="ts", description="mapper short name") + + def requires(self): + return { + "refs": RefsMapped(mapper=self.mapper), + "ol": OpenLibraryMapped(mapper=self.mapper), + } + + def run(self): + output = shellout(r""" + skate-reduce -m ol -F <(zstdcat -T0 {refs}) -O <(zstdcat -T0 {ol}) | + zstd -c -T0 > {output} + """, + refs=self.input().get("refs").path, + ol=self.input().get("ol").path) + luigi.LocalTarget(output).move(self.output().path) + + def output(self): + return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd) diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go index ca87994..19e5fb5 100644 --- a/skate/cmd/skate-reduce/main.go +++ b/skate/cmd/skate-reduce/main.go @@ -34,6 +34,11 @@ // * wiki | zippy mode for releases and wikipedia inputs. // | // | $ skate-reduce -m wiki -L a.ndj -W b.ndj +// | +// | +// * ol | zippy mode for releases and OL inputs. +// | +// | $ skate-reduce -m ol -F a.ndj -O b.ndj // package main @@ -58,9 +63,10 @@ var ( logFile = flag.String("log", "", "log filename") // Possible inputs -- we could switch to a subcommand cli parser? 
- refs = flag.String("F", "", "path to refs input") - releases = flag.String("L", "", "path to release input") - wiki = flag.String("W", "", "path to wiki input") + refs = flag.String("F", "", "path to refs input") + releases = flag.String("L", "", "path to release input") + wiki = flag.String("W", "", "path to wiki input") + openLibrary = flag.String("O", "", "path to open library input") // Extra args. reason = flag.String("r", "", "reason for match: doi, pmid, pmcid, arxiv, unknown") @@ -131,6 +137,14 @@ func main() { if err := skate.ZippyExactWiki(l, w, reasonMap["doi"], bw); err != nil { log.Fatal(err) } + case "ol": + o, f, err := xio.OpenTwo(*openLibrary, *refs) + if err != nil { + log.Fatal(err) + } + if err := skate.ZippyRefsOpenLibrary(o, f, bw); err != nil { + log.Fatal(err) + } default: log.Fatalf("invalid mode") } diff --git a/skate/zippy.go b/skate/zippy.go index f2dca98..194b8ba 100644 --- a/skate/zippy.go +++ b/skate/zippy.go @@ -159,6 +159,53 @@ func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error { return zipper.Run() } +// ZippyRefsOpenLibrary takes a release and refs reader (tsv, with ident, key, doc) +// and will execute gf for each group found. +func ZippyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error { + // Define a grouper, working on one set of refs and releases with the same + // key at a time. Here, we do verification and write out the generated + // biblioref. 
+ var ( + enc = json.NewEncoder(w) + keyer = makeKeyFunc("\t", 1) + grouper = func(g *zipkey.Group) error { + var ( + re, pivot *Release + err error + ) + if len(g.G0) == 0 || len(g.G1) == 0 { + return nil + } + if pivot, err = stringToRelease(cut(g.G0[0], "\t", 2)); err != nil { + return err + } + for _, line := range g.G1 { + if re, err = stringToRelease(cut(line, "\t", 2)); err != nil { + return err + } + result := Verify(pivot, re) + switch result.Status { + case StatusExact, StatusStrong: + if result.Reason == ReasonDOI { + continue + } + br := generateBiblioRef(re, pivot, result, "fuzzy") + if err := enc.Encode(br); err != nil { + return err + } + default: + // XXX: We want to add unmatched pieces as well; here? We + // probably want to do a single final pass to complete the + // dataset. + } + } + return nil + } + ) + zipper := zipkey.New(olr, refs, keyer, grouper) + return zipper.Run() +} + // makeKeyFunc creates a function that can be used as keyFunc, selecting a // column from fields separated by sep; column is 1-indexed. func makeKeyFunc(sep string, column int) func(string) (string, error) { |