aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
author Martin Czygan <martin.czygan@gmail.com> 2021-05-22 00:40:36 +0200
committer Martin Czygan <martin.czygan@gmail.com> 2021-05-22 00:40:36 +0200
commit 131595eb60457115dbe9d3bc7c9dedb4253a17ed (patch)
tree 3c9fade2168e36afac265895058d8e0d5d5fb46c /skate
parent d0413165c6259d8c69f04948ceb649eb40910d0b (diff)
download refcat-131595eb60457115dbe9d3bc7c9dedb4253a17ed.tar.gz
refcat-131595eb60457115dbe9d3bc7c9dedb4253a17ed.zip
wip: ol fuzzy matching
Diffstat (limited to 'skate')
-rw-r--r-- skate/cmd/skate-reduce/main.go 20
-rw-r--r-- skate/zippy.go 47
2 files changed, 64 insertions, 3 deletions
diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go
index ca87994..19e5fb5 100644
--- a/skate/cmd/skate-reduce/main.go
+++ b/skate/cmd/skate-reduce/main.go
@@ -34,6 +34,11 @@
// * wiki | zippy mode for releases and wikipedia inputs.
// |
// | $ skate-reduce -m wiki -L a.ndj -W b.ndj
+// |
+// |
+// * ol | zippy mode for refs and Open Library (OL) inputs.
+// |
+// | $ skate-reduce -m ol -F a.ndj -O b.ndj
//
package main
@@ -58,9 +63,10 @@ var (
logFile = flag.String("log", "", "log filename")
// Possible inputs -- we could switch to a subcommand cli parser?
- refs = flag.String("F", "", "path to refs input")
- releases = flag.String("L", "", "path to release input")
- wiki = flag.String("W", "", "path to wiki input")
+ refs = flag.String("F", "", "path to refs input")
+ releases = flag.String("L", "", "path to release input")
+ wiki = flag.String("W", "", "path to wiki input")
+ openLibrary = flag.String("O", "", "path to open library input")
// Extra args.
reason = flag.String("r", "", "reason for match: doi, pmid, pmcid, arxiv, unknown")
@@ -131,6 +137,14 @@ func main() {
if err := skate.ZippyExactWiki(l, w, reasonMap["doi"], bw); err != nil {
log.Fatal(err)
}
+ case "ol":
+ o, f, err := xio.OpenTwo(*openLibrary, *refs)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if err := skate.ZippyRefsOpenLibrary(o, f, bw); err != nil {
+ log.Fatal(err)
+ }
default:
log.Fatalf("invalid mode")
}
diff --git a/skate/zippy.go b/skate/zippy.go
index f2dca98..194b8ba 100644
--- a/skate/zippy.go
+++ b/skate/zippy.go
@@ -159,6 +159,53 @@ func ZippyVerifyRefs(releases, refs io.Reader, w io.Writer) error {
return zipper.Run()
}
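+// ZippyRefsOpenLibrary takes an open library and a refs reader (tsv, with ident, key, doc)
+// and, per key group, verifies each ref against the first open library record, writing a biblioref to w for exact or strong matches not based on DOI.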
+func ZippyRefsOpenLibrary(olr, refs io.Reader, w io.Writer) error {
+ // Define a grouper, working on one set of open library records and refs with
+ // the same key at a time. Here, we do verification and write out the generated
+ // biblioref.
+ var (
+ enc = json.NewEncoder(w)
+ keyer = makeKeyFunc("\t", 1)
+ grouper = func(g *zipkey.Group) error {
+ var (
+ re, pivot *Release
+ err error
+ )
+ if len(g.G0) == 0 || len(g.G1) == 0 {
+ return nil
+ }
+ if pivot, err = stringToRelease(cut(g.G0[0], "\t", 2)); err != nil {
+ return err
+ }
+ for _, line := range g.G1 {
+ if re, err = stringToRelease(cut(line, "\t", 2)); err != nil {
+ return err
+ }
+ result := Verify(pivot, re)
+ switch result.Status {
+ case StatusExact, StatusStrong:
+ if result.Reason == ReasonDOI {
+ continue
+ }
+ br := generateBiblioRef(re, pivot, result, "fuzzy")
+ if err := enc.Encode(br); err != nil {
+ return err
+ }
+ default:
+ // XXX: We want to add unmatched pieces as well; here? We
+ // probably want to do a single final pass to complete the
+ // dataset.
+ }
+ }
+ return nil
+ }
+ )
+ zipper := zipkey.New(olr, refs, keyer, grouper)
+ return zipper.Run()
+}
+
// makeKeyFunc creates a function that can be used as keyFunc, selecting a
// column from fields separated by sep; column is 1-indexed.
func makeKeyFunc(sep string, column int) func(string) (string, error) {