aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-03-30 03:11:52 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-03-30 03:15:25 +0200
commitcdd223181639653c5d79d4e08f6307412df2fc61 (patch)
treec349f213f2710a07de09975ddba00b51d5bcb826 /skate
parent2ba04744b80122f3b1a7e01130a5d9cea53462fb (diff)
downloadrefcat-cdd223181639653c5d79d4e08f6307412df2fc61.tar.gz
refcat-cdd223181639653c5d79d4e08f6307412df2fc61.zip
example task
Diffstat (limited to 'skate')
-rw-r--r--skate/cmd/skate-biblioref-from-wikipedia/main.go2
-rw-r--r--skate/cmd/skate-verify/main.go27
-rw-r--r--skate/verify.go50
3 files changed, 75 insertions, 4 deletions
diff --git a/skate/cmd/skate-biblioref-from-wikipedia/main.go b/skate/cmd/skate-biblioref-from-wikipedia/main.go
index b51c953..e598491 100644
--- a/skate/cmd/skate-biblioref-from-wikipedia/main.go
+++ b/skate/cmd/skate-biblioref-from-wikipedia/main.go
@@ -30,7 +30,7 @@ func main() {
if idl.DOI == "" {
return nil, nil
}
- s := fmt.Sprintf("%s\t%s", idl.DOI, string(p))
+ s := fmt.Sprintf("%s\t%s\t%s", w.PageTitle, idl.DOI, string(p))
return []byte(s), nil
})
diff --git a/skate/cmd/skate-verify/main.go b/skate/cmd/skate-verify/main.go
index e6fc417..e59d263 100644
--- a/skate/cmd/skate-verify/main.go
+++ b/skate/cmd/skate-verify/main.go
@@ -16,17 +16,18 @@ import (
"runtime/pprof"
"strings"
- jsoniter "github.com/json-iterator/go"
"git.archive.org/martin/cgraph/skate"
"git.archive.org/martin/cgraph/skate/parallel"
+ jsoniter "github.com/json-iterator/go"
)
var (
numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
batchSize = flag.Int("b", 10000, "batch size")
- mode = flag.String("m", "ref", "mode: exact, ref, bref, zip, bzip")
+ mode = flag.String("m", "ref", "mode: exact, ref, bref, zip, bzip, wiki")
exactReason = flag.String("r", "", "doi, pmid, pmcid, arxiv")
provenance = flag.String("p", "join", "provenance info")
+ wikiFile = flag.String("W", "", "wiki citation file")
releasesFile = flag.String("R", "", "releases, tsv, sorted by key (zip mode only)")
refsFile = flag.String("F", "", "refs, tsv, sorted by key (zip mode only)")
cpuProfile = flag.String("cpuprofile", "", "write cpu profile to file")
@@ -90,7 +91,7 @@ func main() {
// Take two "sorted key files" (one refs, one releases) and run
// verification across groups, generate biblioref file.
if *refsFile == "" || *releasesFile == "" {
- log.Fatal("zip mode requires -R and -F to be set")
+ log.Fatal("zip mode requires -F and -R to be set")
}
f, err := os.Open(*releasesFile)
if err != nil {
@@ -123,6 +124,26 @@ func main() {
if err := pp.Run(); err != nil {
log.Fatal(err)
}
+	case "wiki":
+		// Wiki mode: zip the releases file (-R) with the wikipedia citation
+		// file (-W) on their key column and write bibliorefs to stdout. No
+		// verification is run; every emitted pair gets the fixed exact/DOI
+		// match result.
+		if *wikiFile == "" || *releasesFile == "" {
+			// The guard checks -W and -R, so the message must name those flags.
+			log.Fatal("wiki mode requires -W and -R to be set")
+		}
+		f, err := os.Open(*releasesFile)
+		if err != nil {
+			log.Fatal(err)
+		}
+		defer f.Close()
+		g, err := os.Open(*wikiFile)
+		if err != nil {
+			log.Fatal(err)
+		}
+		defer g.Close()
+		bw := bufio.NewWriter(os.Stdout)
+		defer bw.Flush()
+		// Keyed fields keep the literal valid if MatchResult gains or
+		// reorders fields.
+		mr := skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonDOI}
+		// The third column of the wiki file holds wiki citation JSON, not a
+		// ref document, so the wiki-specific zipper has to be used here.
+		if err := skate.ZipWikiUnverified(f, g, mr, "wiki", bw); err != nil {
+			log.Fatal(err)
+		}
default:
log.Fatal("not implemented, only: zip, ref, bref")
}
diff --git a/skate/verify.go b/skate/verify.go
index cd40279..e6eb8b8 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -247,6 +247,51 @@ func ZipUnverified(releases, refs io.Reader, mr MatchResult, provenance string,
return zipper.Run()
}
+// ZipWikiUnverified takes a release and wiki reader (tsv, with ident, key, doc)
+// and assigns a fixed match result.
+//
+// Lines of both readers are grouped by the value of their second
+// tab-separated column. A group that has no line on the release side or no
+// line on the wiki side is skipped. Otherwise the third column of the first
+// release line is decoded as the target release, and for every wiki line of
+// the group one BiblioRef is JSON-encoded to w, carrying the given provenance
+// and the short forms of mr.Status and mr.Reason. No verification between the
+// documents is performed.
+//
+// The keyer fails on a line with an empty key column; the grouper fails on
+// the first document that cannot be decoded or encoded. Presumably
+// zipkey's Run hands those errors back to the caller - confirm in zipkey.
+func ZipWikiUnverified(releases, wiki io.Reader, mr MatchResult, provenance string, w io.Writer) error {
+	enc := json.NewEncoder(w)
+	// keyer extracts the join key (second column) from a raw line.
+	keyer := func(s string) (string, error) {
+		k := lineColumn(s, "\t", 2)
+		if k == "" {
+			return k, fmt.Errorf("cannot get key: %s", s)
+		}
+		return k, nil
+	}
+	// grouper handles one set of release (G0) and wiki (G1) lines sharing the
+	// same key and writes out the generated bibliorefs.
+	grouper := func(g *zipkey.Group) error {
+		if len(g.G0) == 0 || len(g.G1) == 0 {
+			return nil
+		}
+		// Only the first release of the group is used as target.
+		target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
+		if err != nil {
+			return err
+		}
+		for _, line := range g.G1 {
+			// Named cit so that it does not shadow the wiki reader parameter.
+			cit, err := stringToWiki(lineColumn(line, "\t", 3))
+			if err != nil {
+				return err
+			}
+			var bref BiblioRef
+			bref.Key = fmt.Sprintf("%s_%s", slugifyString(cit.PageTitle), target.Ident) // XXX: what should we use?
+			bref.SourceWikipediaArticle = cit.PageTitle
+			bref.TargetReleaseIdent = target.Ident
+			bref.TargetWorkIdent = target.WorkID
+			bref.MatchProvenance = provenance
+			bref.MatchStatus = mr.Status.Short()
+			bref.MatchReason = mr.Reason.Short()
+			if err := enc.Encode(bref); err != nil {
+				return err
+			}
+		}
+		return nil
+	}
+	zipper := zipkey.New(releases, wiki, keyer, grouper)
+	return zipper.Run()
+}
+
// ZipVerifyRefs takes a release and refs reader (tsv, with ident, key, doc)
// and will execute gf for each group found.
func ZipVerifyRefs(releases, refs io.Reader, w io.Writer) error {
@@ -313,6 +358,11 @@ func stringToRef(s string) (r *Ref, err error) {
return
}
+// stringToWiki decodes the JSON document in s into a MinimalCitations value.
+// The pointer is returned together with whatever error json.Unmarshal
+// reported.
+func stringToWiki(s string) (r *MinimalCitations, err error) {
+	err = json.Unmarshal([]byte(s), &r)
+	return r, err
+}
+
// Verify follows the fuzzycat (Python) implementation of this function: it
// compares two release entities. The Go version can be used for large batch
// processing (where the Python version might take two or more days).