From 98f18b3a1044eed995a26019dc14a37ace5aa9be Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Tue, 30 Mar 2021 03:11:52 +0200
Subject: example task

---
 python/refcat/tasks.py                           | 14 ++++++-
 skate/cmd/skate-biblioref-from-wikipedia/main.go |  2 +-
 skate/cmd/skate-verify/main.go                   | 27 +++++++++++--
 skate/verify.go                                  | 50 ++++++++++++++++++++++++
 4 files changed, 87 insertions(+), 6 deletions(-)

diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index df56b9d..fbed8ca 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -1429,10 +1429,20 @@ class MAGDOI(Refcat):
 # ==== WikipediaCitations
 
 
-class BiblioRefWikipediaCitations(Refcat):
+class BiblioRefWikiDOISortedKeys(Refcat):
     """
-    Generate a biblioref schema from wikipedia citations minimal file.
+    Sorted DOI keys from wikipedia.
     """
 
     def requires(self):
         return WikipediaCitationsMinimalDataset()
+
+    def run(self):
+        output = shellout("""cat {input} |
+                          skate-biblioref-from-wikipedia |
+                          LC_ALL=C sort -S 10% -k2,2 |
+                          zstd -T0 -c > {output}
+                          """, input=self.input().path)
+
+    def output(self):
+        return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
diff --git a/skate/cmd/skate-biblioref-from-wikipedia/main.go b/skate/cmd/skate-biblioref-from-wikipedia/main.go
index b51c953..e598491 100644
--- a/skate/cmd/skate-biblioref-from-wikipedia/main.go
+++ b/skate/cmd/skate-biblioref-from-wikipedia/main.go
@@ -30,7 +30,7 @@ func main() {
 		if idl.DOI == "" {
 			return nil, nil
 		}
-		s := fmt.Sprintf("%s\t%s", idl.DOI, string(p))
+		s := fmt.Sprintf("%s\t%s\t%s", w.PageTitle, idl.DOI, string(p))
 		return []byte(s), nil
 	})
 
diff --git a/skate/cmd/skate-verify/main.go b/skate/cmd/skate-verify/main.go
index e6fc417..e59d263 100644
--- a/skate/cmd/skate-verify/main.go
+++ b/skate/cmd/skate-verify/main.go
@@ -16,17 +16,18 @@ import (
 	"runtime/pprof"
 	"strings"
 
-	jsoniter "github.com/json-iterator/go"
 	"git.archive.org/martin/cgraph/skate"
 	"git.archive.org/martin/cgraph/skate/parallel"
+	jsoniter "github.com/json-iterator/go"
 )
 
 var (
 	numWorkers   = flag.Int("w", runtime.NumCPU(), "number of workers")
 	batchSize    = flag.Int("b", 10000, "batch size")
-	mode         = flag.String("m", "ref", "mode: exact, ref, bref, zip, bzip")
+	mode         = flag.String("m", "ref", "mode: exact, ref, bref, zip, bzip, wiki")
 	exactReason  = flag.String("r", "", "doi, pmid, pmcid, arxiv")
 	provenance   = flag.String("p", "join", "provenance info")
+	wikiFile     = flag.String("W", "", "wiki citation file")
 	releasesFile = flag.String("R", "", "releases, tsv, sorted by key (zip mode only)")
 	refsFile     = flag.String("F", "", "refs, tsv, sorted by key (zip mode only)")
 	cpuProfile   = flag.String("cpuprofile", "", "write cpu profile to file")
@@ -90,7 +91,7 @@ func main() {
 		// Take two "sorted key files" (one refs, one releases) and run
 		// verification across groups, generate biblioref file.
 		if *refsFile == "" || *releasesFile == "" {
-			log.Fatal("zip mode requires -R and -F to be set")
+			log.Fatal("zip mode requires -F and -R to be set")
 		}
 		f, err := os.Open(*releasesFile)
 		if err != nil {
@@ -123,6 +124,26 @@ func main() {
 		if err := pp.Run(); err != nil {
 			log.Fatal(err)
 		}
+	case "wiki":
+		// Fixed zip mode for DOI from wikipedia.
+		if *wikiFile == "" || *releasesFile == "" {
+			log.Fatal("wiki mode requires -W and -R to be set")
+		}
+		f, err := os.Open(*releasesFile)
+		if err != nil {
+			log.Fatal(err)
+		}
+		defer f.Close()
+		g, err := os.Open(*wikiFile)
+		if err != nil {
+			log.Fatal(err)
+		}
+		defer g.Close()
+		bw := bufio.NewWriter(os.Stdout)
+		defer bw.Flush()
+		if err := skate.ZipWikiUnverified(f, g, skate.MatchResult{skate.StatusExact, skate.ReasonDOI}, "wiki", bw); err != nil {
+			log.Fatal(err)
+		}
 	default:
 		log.Fatal("not implemented, only: zip, ref, bref")
 	}
diff --git a/skate/verify.go b/skate/verify.go
index cd40279..e6eb8b8 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -247,6 +247,51 @@ func ZipUnverified(releases, refs io.Reader, mr MatchResult, provenance string,
 	return zipper.Run()
 }
 
+// ZipWikiUnverified takes a release reader (tsv, with ident, key, doc) and a
+// wiki reader (tsv, with page title, key, doc) and assigns a fixed match result.
+func ZipWikiUnverified(releases, wiki io.Reader, mr MatchResult, provenance string, w io.Writer) error {
+	// Define a grouper, working on one set of wiki citations and releases with
+	// the same key at a time. No verification happens here; each pair gets the
+	// fixed match result and is written out as a biblioref.
+	enc := json.NewEncoder(w)
+	keyer := func(s string) (string, error) {
+		if k := lineColumn(s, "\t", 2); k == "" {
+			return k, fmt.Errorf("cannot get key: %s", s)
+		} else {
+			return k, nil
+		}
+	}
+	grouper := func(g *zipkey.Group) error {
+		if len(g.G0) == 0 || len(g.G1) == 0 {
+			return nil
+		}
+		target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
+		if err != nil {
+			return err
+		}
+		for _, line := range g.G1 {
+			wiki, err := stringToWiki(lineColumn(line, "\t", 3))
+			if err != nil {
+				return err
+			}
+			var bref BiblioRef
+			bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use?
+			bref.SourceWikipediaArticle = wiki.PageTitle
+			bref.TargetReleaseIdent = target.Ident
+			bref.TargetWorkIdent = target.WorkID
+			bref.MatchProvenance = provenance
+			bref.MatchStatus = mr.Status.Short()
+			bref.MatchReason = mr.Reason.Short()
+			if err := enc.Encode(bref); err != nil {
+				return err
+			}
+		}
+		return nil
+	}
+	zipper := zipkey.New(releases, wiki, keyer, grouper)
+	return zipper.Run()
+}
+
 // ZipVerifyRefs takes a release and refs reader (tsv, with ident, key, doc)
 // and will execute gf for each group found.
 func ZipVerifyRefs(releases, refs io.Reader, w io.Writer) error {
@@ -313,6 +358,11 @@ func stringToRef(s string) (r *Ref, err error) {
 	return
 }
 
+func stringToWiki(s string) (r *MinimalCitations, err error) {
+	err = json.Unmarshal([]byte(s), &r)
+	return
+}
+
 // Verify follows the fuzzycat (Python) implementation of this function: it
 // compares two release entities. The Go version can be used for large batch
 // processing (where the Python version might take two or more days).
-- 
cgit v1.2.3