aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--python/refcat/tasks.py14
-rw-r--r--skate/cmd/skate-biblioref-from-wikipedia/main.go2
-rw-r--r--skate/cmd/skate-verify/main.go27
-rw-r--r--skate/verify.go50
4 files changed, 87 insertions, 6 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index df56b9d..fbed8ca 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -1429,10 +1429,20 @@ class MAGDOI(Refcat):
# ==== WikipediaCitations
-class BiblioRefWikipediaCitations(Refcat):
+class BiblioRefWikiDOISortedKeys(Refcat):
"""
- Generate a biblioref schema from wikipedia citations minimal file.
+ Sorted DOI keys from wikipedia.
"""
def requires(self):
return WikipediaCitationsMinimalDataset()
+
+ def run(self):
+ output = shellout("cat {input} |
+ skate-biblioref-from-wikipedia |
+ LC_ALL=C sort -s 10% -k2,2 |
+ zstd -T0 -c > {output}
+ """, input=self.input().path)
+
+ def output(self):
+ return luigi.LocalTarget(path=self.path(ext="tsv.zst"), format=Zstd)
diff --git a/skate/cmd/skate-biblioref-from-wikipedia/main.go b/skate/cmd/skate-biblioref-from-wikipedia/main.go
index b51c953..e598491 100644
--- a/skate/cmd/skate-biblioref-from-wikipedia/main.go
+++ b/skate/cmd/skate-biblioref-from-wikipedia/main.go
@@ -30,7 +30,7 @@ func main() {
if idl.DOI == "" {
return nil, nil
}
- s := fmt.Sprintf("%s\t%s", idl.DOI, string(p))
+ s := fmt.Sprintf("%s\t%s\t%s", w.PageTitle, idl.DOI, string(p))
return []byte(s), nil
})
diff --git a/skate/cmd/skate-verify/main.go b/skate/cmd/skate-verify/main.go
index e6fc417..e59d263 100644
--- a/skate/cmd/skate-verify/main.go
+++ b/skate/cmd/skate-verify/main.go
@@ -16,17 +16,18 @@ import (
"runtime/pprof"
"strings"
- jsoniter "github.com/json-iterator/go"
"git.archive.org/martin/cgraph/skate"
"git.archive.org/martin/cgraph/skate/parallel"
+ jsoniter "github.com/json-iterator/go"
)
var (
numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
batchSize = flag.Int("b", 10000, "batch size")
- mode = flag.String("m", "ref", "mode: exact, ref, bref, zip, bzip")
+ mode = flag.String("m", "ref", "mode: exact, ref, bref, zip, bzip, wiki")
exactReason = flag.String("r", "", "doi, pmid, pmcid, arxiv")
provenance = flag.String("p", "join", "provenance info")
+ wikiFile = flag.String("W", "", "wiki citation file")
releasesFile = flag.String("R", "", "releases, tsv, sorted by key (zip mode only)")
refsFile = flag.String("F", "", "refs, tsv, sorted by key (zip mode only)")
cpuProfile = flag.String("cpuprofile", "", "write cpu profile to file")
@@ -90,7 +91,7 @@ func main() {
// Take two "sorted key files" (one refs, one releases) and run
// verification across groups, generate biblioref file.
if *refsFile == "" || *releasesFile == "" {
- log.Fatal("zip mode requires -R and -F to be set")
+ log.Fatal("zip mode requires -F and -R to be set")
}
f, err := os.Open(*releasesFile)
if err != nil {
@@ -123,6 +124,26 @@ func main() {
if err := pp.Run(); err != nil {
log.Fatal(err)
}
+	case "wiki":
+		// Zip releases with wikipedia citations on their shared key (DOI)
+		// and emit a biblioref with a fixed exact/DOI match result for each
+		// citation; no verification is performed in this mode.
+		if *wikiFile == "" || *releasesFile == "" {
+			// The releases file is passed via -R (-F is the refs file).
+			log.Fatal("wiki mode requires -W and -R to be set")
+		}
+		f, err := os.Open(*releasesFile)
+		if err != nil {
+			log.Fatal(err)
+		}
+		defer f.Close()
+		g, err := os.Open(*wikiFile)
+		if err != nil {
+			log.Fatal(err)
+		}
+		defer g.Close()
+		bw := bufio.NewWriter(os.Stdout)
+		defer bw.Flush()
+		// Keyed fields keep the literal valid if MatchResult ever changes
+		// its field order.
+		mr := skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonDOI}
+		// The wiki file carries wikipedia citation documents, not refs, so
+		// it needs the wiki-specific zipper rather than ZipUnverified.
+		if err := skate.ZipWikiUnverified(f, g, mr, "wiki", bw); err != nil {
+			log.Fatal(err)
+		}
default:
log.Fatal("not implemented, only: zip, ref, bref")
}
diff --git a/skate/verify.go b/skate/verify.go
index cd40279..e6eb8b8 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -247,6 +247,51 @@ func ZipUnverified(releases, refs io.Reader, mr MatchResult, provenance string,
return zipper.Run()
}
+// ZipWikiUnverified takes a release and wiki reader (tsv, with ident, key, doc)
+// and assigns a fixed match result.
+func ZipWikiUnverified(releases, wiki io.Reader, mr MatchResult, provenance string, w io.Writer) error {
+ // Define a grouper, working on one set of refs and releases with the same
+ // key at a time. Here, we do verification and write out the generated
+ // biblioref.
+ enc := json.NewEncoder(w)
+ keyer := func(s string) (string, error) {
+ if k := lineColumn(s, "\t", 2); k == "" {
+ return k, fmt.Errorf("cannot get key: %s", s)
+ } else {
+ return k, nil
+ }
+ }
+ grouper := func(g *zipkey.Group) error {
+ if len(g.G0) == 0 || len(g.G1) == 0 {
+ return nil
+ }
+ target, err := stringToRelease(lineColumn(g.G0[0], "\t", 3))
+ if err != nil {
+ return err
+ }
+ for _, line := range g.G1 {
+ wiki, err := stringToWiki(lineColumn(line, "\t", 3))
+ if err != nil {
+ return err
+ }
+ var bref BiblioRef
+ bref.Key = fmt.Sprintf("%s_%s", slugifyString(wiki.PageTitle), target.Ident) // XXX: what should we use?
+ bref.SourceWikipediaArticle = wiki.PageTitle
+ bref.TargetReleaseIdent = target.Ident
+ bref.TargetWorkIdent = target.WorkID
+ bref.MatchProvenance = provenance
+ bref.MatchStatus = mr.Status.Short()
+ bref.MatchReason = mr.Reason.Short()
+ if err := enc.Encode(bref); err != nil {
+ return err
+ }
+ }
+ return nil
+ }
+ zipper := zipkey.New(releases, wiki, keyer, grouper)
+ return zipper.Run()
+}
+
// ZipVerifyRefs takes a release and refs reader (tsv, with ident, key, doc)
// and will execute gf for each group found.
func ZipVerifyRefs(releases, refs io.Reader, w io.Writer) error {
@@ -313,6 +358,11 @@ func stringToRef(s string) (r *Ref, err error) {
return
}
+// stringToWiki decodes the JSON document s into a MinimalCitations value and
+// returns it together with any error reported by json.Unmarshal.
+func stringToWiki(s string) (r *MinimalCitations, err error) {
+	if err = json.Unmarshal([]byte(s), &r); err != nil {
+		return r, err
+	}
+	return r, nil
+}
+
// Verify follows the fuzzycat (Python) implementation of this function: it
// compares two release entities. The Go version can be used for large batch
// processing (where the Python version might take two or more days).