aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-05-06 19:47:35 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-05-06 19:47:35 +0200
commitb46c9d351aa0a2a5c3618a1420259d4605a9654e (patch)
tree376f55f59baf657397210b2a2a3724b9ba61f6bb /skate
parentfb627564751d170c55b45785da569268b6290be9 (diff)
downloadrefcat-b46c9d351aa0a2a5c3618a1420259d4605a9654e.tar.gz
refcat-b46c9d351aa0a2a5c3618a1420259d4605a9654e.zip
move skate-verify to skate-reduce
Diffstat (limited to 'skate')
-rw-r--r--skate/Makefile2
-rw-r--r--skate/cmd/skate-reduce/main.go144
-rw-r--r--skate/cmd/skate-verify/main.go158
-rw-r--r--skate/xio/util.go14
-rw-r--r--skate/zippy.go28
5 files changed, 173 insertions, 173 deletions
diff --git a/skate/Makefile b/skate/Makefile
index f098ecd..97b192d 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -1,5 +1,5 @@
SHELL := /bin/bash
-TARGETS := skate-conv skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map
+TARGETS := skate-conv skate-cluster skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map skate-reduce
PKGNAME := skate
.PHONY: test
diff --git a/skate/cmd/skate-reduce/main.go b/skate/cmd/skate-reduce/main.go
new file mode 100644
index 0000000..df72ef4
--- /dev/null
+++ b/skate/cmd/skate-reduce/main.go
@@ -0,0 +1,144 @@
+// skate-reduce takes prepared inputs (e.g. from skate-map or skate-cluster)
+// and applies various verification and conversion functions. The output will
+// often be the biblioref schema.
+//
+// Support various modes.
+//
+// * exact: takes (key, doc) TSV files (one for releases, one for refs) and
+// will emit biblioref docs relating one element from releases with all
+// elements from ref; this is for "doi", "pmid" and other id matches, where no
+// further checks are necessary. The match reason, e.g. "doi" needs to be
+// supplied.
+//
+// $ skate-reduce -m exact -r doi -F a.tsv -L b.tsv
+//
+// * verify: takes (key, doc) TSV files (one for release, one for refs), runs
+// verification within a group and will emit biblioref.
+//
+// $ skate-reduce -m verify -F a.tsv -L b.tsv
+//
+// * ref: takes a single file with clusters containing releases and refs and
+// will emit verification results.
+//
+// $ skate-reduce -m ref < a.ndj
+//
+// * bref: same as ref, but generate a biblioref file as output
+//
+// $ skate-reduce -m bref < a.ndj
+//
+// * wiki: zippy mode for releases and wikipedia inputs.
+//
+// $ skate-reduce -m wiki -L a.ndj -W b.ndj
+//
+package main
+
+import (
+ "bufio"
+ "flag"
+ "log"
+ "os"
+ "runtime"
+ "runtime/pprof"
+
+ "git.archive.org/martin/cgraph/skate"
+ "git.archive.org/martin/cgraph/skate/parallel"
+ "git.archive.org/martin/cgraph/skate/xio"
+)
+
+var (
+ numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
+ batchSize = flag.Int("b", 10000, "batch size")
+ // Each mode may work on one or two files, and may need extra args.
+ mode = flag.String("m", "ref", "mode, e.g. exact, verify, ref, bref, wiki")
+
+ cpuProfile = flag.String("cpuprofile", "", "write cpu profile to file")
+ memProfile = flag.String("memprofile", "", "write heap profile to file (go tool pprof -png --alloc_objects program mem.pprof > mem.png)")
+
+ // Possible inputs, we could switch to a subcommand cli parser.
+ refs = flag.String("F", "", "path to refs input")
+ releases = flag.String("L", "", "path to release input")
+ wiki = flag.String("W", "", "path to wiki input")
+
+ // Extra args.
+ reason = flag.String("r", "", "reason for match: doi, pmid, pmcid, arxiv, unknown")
+ reasonMap = map[string]skate.MatchResult{
+ "doi": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonDOI},
+ "pmid": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonPMID},
+ "pmcid": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonPMCID},
+ "arxiv": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonArxiv},
+ "unknown": skate.MatchResult{Status: skate.StatusUnknown, Reason: skate.ReasonUnknown},
+ }
+)
+
+func main() {
+ flag.Parse()
+ if *cpuProfile != "" {
+ file, err := os.Create(*cpuProfile)
+ if err != nil {
+ log.Fatal(err)
+ }
+ pprof.StartCPUProfile(file)
+ defer pprof.StopCPUProfile()
+ }
+
+ bw := bufio.NewWriter(os.Stdout)
+ defer bw.Flush()
+
+ switch *mode {
+ case "exact":
+ l, f, err := xio.OpenTwo(*releases, *refs)
+ if err != nil {
+ log.Fatal(err)
+ }
+ r, ok := reasonMap[*reason]
+ if !ok {
+ log.Fatalf("unknown reason: %v", *reason)
+ }
+ if err := skate.ZippyExact(l, f, r, bw); err != nil {
+ log.Fatal(err)
+ }
+ case "verify":
+ l, f, err := xio.OpenTwo(*releases, *refs)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if err := skate.ZippyVerifyRefs(l, f, bw); err != nil {
+ log.Fatal(err)
+ }
+ case "ref":
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterVerify)
+ pp.NumWorkers = *numWorkers
+ pp.BatchSize = *batchSize
+ if err := pp.Run(); err != nil {
+ log.Fatal(err)
+ }
+ case "bref":
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterToBiblioRef)
+ pp.NumWorkers = *numWorkers
+ pp.BatchSize = *batchSize
+ if err := pp.Run(); err != nil {
+ log.Fatal(err)
+ }
+ case "wiki":
+ l, w, err := xio.OpenTwo(*releases, *wiki)
+ if err != nil {
+ log.Fatal(err)
+ }
+ if err := skate.ZippyExactWiki(l, w, reasonMap["doi"], bw); err != nil {
+ log.Fatal(err)
+ }
+ default:
+ log.Fatalf("invalid mode")
+ }
+ if *memProfile != "" {
+ f, err := os.Create(*memProfile)
+ if err != nil {
+ log.Fatal("could not create memory profile: ", err)
+ }
+ defer f.Close()
+ runtime.GC()
+ if err := pprof.WriteHeapProfile(f); err != nil {
+ log.Fatal(err)
+ }
+ }
+}
diff --git a/skate/cmd/skate-verify/main.go b/skate/cmd/skate-verify/main.go
deleted file mode 100644
index 1288404..0000000
--- a/skate/cmd/skate-verify/main.go
+++ /dev/null
@@ -1,158 +0,0 @@
-// Generate pairs and run verification on larger number of records. Mimick
-// fuzzycat.verify, but make it faster (e.g. fuzzycat took about 50h for the
-// complete set).
-//
-// Currently: about 2h for 40M clusters (in "ref" mode).
-//
-// XXX: Cleanup inconsistent "modes".
-package main
-
-import (
- "bufio"
- "flag"
- "io"
- "log"
- "os"
- "runtime"
- "runtime/pprof"
- "strings"
-
- "git.archive.org/martin/cgraph/skate"
- "git.archive.org/martin/cgraph/skate/parallel"
-)
-
-var (
- numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
- batchSize = flag.Int("b", 10000, "batch size")
- mode = flag.String("m", "ref", "mode: exact, ref, bref, zip, bzip, wiki")
- exactReason = flag.String("r", "", "doi, pmid, pmcid, arxiv")
- provenance = flag.String("p", "join", "provenance info")
- wikiFile = flag.String("W", "", "wiki citation file")
- releasesFile = flag.String("R", "", "releases, tsv, sorted by key (zip mode only)")
- refsFile = flag.String("F", "", "refs, tsv, sorted by key (zip mode only)")
- cpuProfile = flag.String("cpuprofile", "", "write cpu profile to file")
- memProfile = flag.String("memprofile", "", "write heap profile to file (go tool pprof -png --alloc_objects program mem.pprof > mem.png)")
-
- // XXX: This should be cleanup up soon.
- matchResults = map[string]skate.MatchResult{
- "doi": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonDOI},
- "pmid": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonPMID},
- "pmcid": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonPMCID},
- "arxiv": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonArxiv},
- "unknown": skate.MatchResult{Status: skate.StatusUnknown, Reason: skate.ReasonUnknown},
- }
-)
-
-func main() {
- flag.Parse()
- if *cpuProfile != "" {
- file, err := os.Create(*cpuProfile)
- if err != nil {
- log.Fatal(err)
- }
- pprof.StartCPUProfile(file)
- defer pprof.StopCPUProfile()
- }
- var (
- f, g io.ReadCloser
- err error
- bw = bufio.NewWriter(os.Stdout)
- )
- defer bw.Flush()
- switch *mode {
- case "exact":
- // Fixed zip mode for DOI.
- if *refsFile == "" || *releasesFile == "" {
- log.Fatal("mode requires -R and -F to be set")
- }
- if *exactReason == "" {
- var keys []string
- for k := range matchResults {
- keys = append(keys, k)
- }
- log.Fatalf("need a reason for the record, one of: %s", strings.Join(keys, ", "))
- }
- if f, g, err = readersFromFilenames(*releasesFile, *refsFile); err != nil {
- log.Fatal(err)
- }
- defer f.Close()
- defer g.Close()
- mr, ok := matchResults[*exactReason]
- if !ok {
- mr = matchResults["unknown"]
- }
- if err := skate.ZippyFixed(f, g, mr, *provenance, bw); err != nil {
- log.Fatal(err)
- }
- case "zip":
- // Take two "sorted key files" (one refs, one releases) and run
- // verification across groups, generate biblioref file.
- if *refsFile == "" || *releasesFile == "" {
- log.Fatal("zip mode requires -F and -R to be set")
- }
- if f, g, err = readersFromFilenames(*releasesFile, *refsFile); err != nil {
- log.Fatal(err)
- }
- defer f.Close()
- defer g.Close()
- if err := skate.ZippyVerifyRefs(f, g, bw); err != nil {
- log.Fatal(err)
- }
- case "ref":
- // https://git.io/JtACz
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterVerify)
- pp.NumWorkers = *numWorkers
- pp.BatchSize = *batchSize
- if err := pp.Run(); err != nil {
- log.Fatal(err)
- }
- case "bref":
- // generate biblioref
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterToBiblioRef)
- pp.NumWorkers = *numWorkers
- pp.BatchSize = *batchSize
- if err := pp.Run(); err != nil {
- log.Fatal(err)
- }
- case "wiki":
- // Fixed zip mode for DOI from wikipedia.
- if *wikiFile == "" || *releasesFile == "" {
- log.Fatal("mode requires -W and -F to be set")
- }
- if f, g, err = readersFromFilenames(*releasesFile, *wikiFile); err != nil {
- log.Fatal(err)
- }
- defer f.Close()
- defer g.Close()
- mr := skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonDOI}
- if err = skate.ZippyFixedWiki(f, g, mr, "wiki", bw); err != nil {
- log.Fatal(err)
- }
- default:
- log.Fatal("not implemented, only: exact, zip, ref, bref, wiki")
- }
- if *memProfile != "" {
- f, err := os.Create(*memProfile)
- if err != nil {
- log.Fatal("could not create memory profile: ", err)
- }
- defer f.Close()
- runtime.GC()
- if err := pprof.WriteHeapProfile(f); err != nil {
- log.Fatal(err)
- }
- }
-}
-
-// readersFromFilenames lets the called check for a single error only.
-func readersFromFilenames(f0, f1 string) (io.ReadCloser, io.ReadCloser, error) {
- f, err := os.Open(f0)
- if err != nil {
- return nil, nil, err
- }
- g, err := os.Open(f1)
- if err != nil {
- return nil, nil, err
- }
- return f, g, nil
-}
diff --git a/skate/xio/util.go b/skate/xio/util.go
new file mode 100644
index 0000000..554317b
--- /dev/null
+++ b/skate/xio/util.go
@@ -0,0 +1,14 @@
+package xio
+
+import "os"
+
+// OpenTwo opens two files, and the caller needs to check for a single error only.
+func OpenTwo(f1, f2 string) (g1 *os.File, g2 *os.File, err error) {
+ if g1, err = os.Open(f1); err != nil {
+ return nil, nil, err
+ }
+ if g2, err = os.Open(f2); err != nil {
+ return nil, nil, err
+ }
+ return g1, g2, nil
+}
diff --git a/skate/zippy.go b/skate/zippy.go
index 6b1c373..c8d2aee 100644
--- a/skate/zippy.go
+++ b/skate/zippy.go
@@ -11,12 +11,12 @@ import (
// This file contains the two-stream (zippy) matchers.
-// ZippyFixed takes a release and refs reader (tsv, with ident, key, doc)
+// ZippyExact takes a release and refs reader (tsv, with ident, key, doc)
// and assigns a fixed match result.
-func ZippyFixed(releases, refs io.Reader, mr MatchResult, provenance string, w io.Writer) error {
+func ZippyExact(releases, refs io.Reader, matchResult MatchResult, w io.Writer) error {
var (
enc = json.NewEncoder(w)
- keyer = makeKeyFunc("\t", 2)
+ keyer = makeKeyFunc("\t", 1)
grouper = func(g *zipkey.Group) error {
var (
target *Release
@@ -26,11 +26,11 @@ func ZippyFixed(releases, refs io.Reader, mr MatchResult, provenance string, w i
if len(g.G0) == 0 || len(g.G1) == 0 {
return nil
}
- if target, err = stringToRelease(cut(g.G0[0], "\t", 3)); err != nil {
+ if target, err = stringToRelease(cut(g.G0[0], "\t", 2)); err != nil {
return err
}
for _, line := range g.G1 {
- if ref, err = stringToRef(cut(line, "\t", 3)); err != nil {
+ if ref, err = stringToRef(cut(line, "\t", 2)); err != nil {
return err
}
var bref BiblioRef
@@ -42,9 +42,9 @@ func ZippyFixed(releases, refs io.Reader, mr MatchResult, provenance string, w i
bref.RefKey = ref.Key
bref.TargetReleaseIdent = target.Ident
bref.TargetWorkIdent = target.WorkID
- bref.MatchProvenance = provenance
- bref.MatchStatus = mr.Status.Short()
- bref.MatchReason = mr.Reason.Short()
+ bref.MatchProvenance = ref.RefSource
+ bref.MatchStatus = matchResult.Status.Short()
+ bref.MatchReason = matchResult.Reason.Short()
if err := enc.Encode(bref); err != nil {
return err
}
@@ -56,12 +56,12 @@ func ZippyFixed(releases, refs io.Reader, mr MatchResult, provenance string, w i
return zipper.Run()
}
-// ZippyFixedWiki takes a release and wiki reader (tsv, with ident, key, doc)
+// ZippyExactWiki takes a release and wiki reader (tsv, with key, doc)
// and assigns a fixed match result.
-func ZippyFixedWiki(releases, wiki io.Reader, mr MatchResult, provenance string, w io.Writer) error {
+func ZippyExactWiki(releases, wiki io.Reader, mr MatchResult, w io.Writer) error {
var (
enc = json.NewEncoder(w)
- keyer = makeKeyFunc("\t", 2)
+ keyer = makeKeyFunc("\t", 1)
grouper = func(g *zipkey.Group) error {
var (
target *Release
@@ -71,11 +71,11 @@ func ZippyFixedWiki(releases, wiki io.Reader, mr MatchResult, provenance string,
if len(g.G0) == 0 || len(g.G1) == 0 {
return nil
}
- if target, err = stringToRelease(cut(g.G0[0], "\t", 3)); err != nil {
+ if target, err = stringToRelease(cut(g.G0[0], "\t", 2)); err != nil {
return err
}
for _, line := range g.G1 {
- if wiki, err = stringToWiki(cut(line, "\t", 3)); err != nil {
+ if wiki, err = stringToWiki(cut(line, "\t", 2)); err != nil {
return err
}
var bref BiblioRef
@@ -83,7 +83,7 @@ func ZippyFixedWiki(releases, wiki io.Reader, mr MatchResult, provenance string,
bref.SourceWikipediaArticle = wiki.PageTitle
bref.TargetReleaseIdent = target.Ident
bref.TargetWorkIdent = target.WorkID
- bref.MatchProvenance = provenance
+ bref.MatchProvenance = "wikipedia"
bref.MatchStatus = mr.Status.Short()
bref.MatchReason = mr.Reason.Short()
if err := enc.Encode(bref); err != nil {