aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-05-22 00:40:36 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-05-22 00:40:36 +0200
commit131595eb60457115dbe9d3bc7c9dedb4253a17ed (patch)
tree3c9fade2168e36afac265895058d8e0d5d5fb46c
parentd0413165c6259d8c69f04948ceb649eb40910d0b (diff)
downloadrefcat-131595eb60457115dbe9d3bc7c9dedb4253a17ed.tar.gz
refcat-131595eb60457115dbe9d3bc7c9dedb4253a17ed.zip
wip: ol fuzzy matching
-rw-r--r--python/refcat/tasks.py26
-rw-r--r--skate/cmd/skate-reduce/main.go20
-rw-r--r--skate/zippy.go47
3 files changed, 89 insertions, 4 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index d17a1f9..aaa0551 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -711,7 +711,7 @@ class OpenLibraryWorks(Refcat):
class OpenLibraryMapped(Refcat):
"""
- Map OL data with e.g. a title normalizing mapper.
+ Map OL data with e.g. a title normalizing mapper. Runtime: 4m8.341s.
"""
mapper = luigi.Parameter(default="ts", description="mapper short name")
@@ -734,3 +734,27 @@ class OpenLibraryMapped(Refcat):
def output(self):
return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
+
+class BrefZipFuzzyOpenLibrary(Refcat):
+ """
+ Fuzzy matching from refs to Open Library (both converted to release schema, same mapper), via skate-reduce -m ol.
+ """
+ mapper = luigi.Parameter(default="ts", description="mapper short name")
+
+ def requires(self):
+ return {
+ "refs": RefsMapped(mapper=self.mapper),
+ "ol": OpenLibraryMapped(mapper=self.mapper),
+ }
+
+ def run(self):
+ output = shellout(r"""
+ skate-reduce -m ol -F <(zstdcat -T0 {refs}) -O <(zstdcat -T0 {ol}) |
+ zstd -c -T0 > {output}
+ """,
+ refs=self.input().get("refs").path,
+ ol=self.input().get("ol").path)
+ luigi.LocalTarget(output).move(self.output().path)
+
+ def output(self):
+ return luigi.LocalTarget(path=self.path(ext="json.zst"), format=Zstd)
diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go
index ca87994..19e5fb5 100644
--- a/skate/cmd/skate-reduce/main.go
+++ b/skate/cmd/skate-reduce/main.go
@@ -34,6 +34,11 @@
// * wiki | zippy mode for releases and wikipedia inputs.
// |
// | $ skate-reduce -m wiki -L a.ndj -W b.ndj
+// |
+// |
+// * ol | zippy mode for releases and OL inputs.
+// |
+// | $ skate-reduce -m ol -F a.ndj -O b.ndj
//
package main
@@ -58,9 +63,10 @@ var (
logFile = flag.String("log", "", "log filename")
// Possible inputs -- we could switch to a subcommand cli parser?
- refs = flag.String("F", "", "path to refs input")
- releases = flag.String("L", "", "path to release input")
- wiki = flag.String("W", "", "path to wiki input")
+ refs = flag.String("F", "", "path to refs input")
+ releases = flag.String("L", "", "path to release input")
+ wiki = flag.String("W", "", "path to wiki input")
+ openLibrary = flag.String("O", "", "path to open library input")
// Extra args.
reason = flag.String("r", "", "reason for match: doi, pmid, pmcid, arxiv, unknown")
@@ -131,6 +137,14 @@ func main() {
if err := skate.ZippyExactWiki(l, w, reasonMap["doi"], bw); err != nil {
log.Fatal(err)
}
+ case "ol":
+ o, f, err := xio.OpenTwo(*openLibrary, *refs)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if err := skate.ZippyRefsOpenLibrary(o, f, bw); err != nil {
+ log.Fatal(err)
+ }
default:
log.Fatalf("invalid mode")
}
diff --git a/skate/zippy.go b/skate/zippy.go
index f2dca98..194b8ba 100644
--- a/skate/zippy.go
+++ b/skate/zippy.go
@@ -159,6 +159,53 @@ func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error {
return zipper.Run()
}
+// ZippyRefsOpenLibrary takes an open library and a refs reader (tsv, key in column 1)
+// and writes a "fuzzy" biblioref to w for each exact or strong, non-DOI match per key group.
+func ZippyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
+ // Define a grouper, working on one key group at a time; G0 and G1 are
+ // presumably the olr and refs lines (zipkey.New argument order). The first
+ // G0 doc is the pivot; each G1 doc is verified against it.
+ var (
+ enc = json.NewEncoder(w)
+ keyer = makeKeyFunc("\t", 1)
+ grouper = func(g *zipkey.Group) error {
+ var (
+ re, pivot *Release
+ err error
+ )
+ if len(g.G0) == 0 || len(g.G1) == 0 {
+ return nil
+ }
+ if pivot, err = stringToRelease(cut(g.G0[0], "\t", 2)); err != nil {
+ return err
+ }
+ for _, line := range g.G1 {
+ if re, err = stringToRelease(cut(line, "\t", 2)); err != nil {
+ return err
+ }
+ result := Verify(pivot, re)
+ switch result.Status {
+ case StatusExact, StatusStrong:
+ if result.Reason == ReasonDOI {
+ continue
+ }
+ br := generateBiblioRef(re, pivot, result, "fuzzy")
+ if err := enc.Encode(br); err != nil {
+ return err
+ }
+ default:
+ // XXX: We want to add unmatched pieces as well; here? We
+ // probably want to do a single final pass to complete the
+ // dataset.
+ }
+ }
+ return nil
+ }
+ )
+ zipper := zipkey.New(olr, refs, keyer, grouper)
+ return zipper.Run()
+}
+
// makeKeyFunc creates a function that can be used as keyFunc, selecting a
// column from fields separated by sep; column is 1-indexed.
func makeKeyFunc(sep string, column int) func(string) (string, error) {