1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
|
// skate-reduce takes prepared inputs (e.g. from skate-map or skate-cluster)
// and applies various verification and conversion functions. The output will
// often be a stream of biblioref schema docs.
//
// Supports various "modes", e.g. exact, verify, ref, bref, wiki. Each mode may
// work on one or two files, and may need extra args.
//
// * exact | takes two (key, doc) TSV files (one for releases, one for refs) and
// | will emit biblioref docs relating *one* element from releases with *all*
// | elements from ref; this is for "doi", "pmid" and other id matches, where no
// | further checks are necessary. The match reason, e.g. "doi" needs to be
// | supplied.
// |
// | $ skate-reduce -m exact -r doi -F a.tsv -L b.tsv
// |
// |
// * verify | takes two (key, doc) TSV files (one for releases, one for refs),
// | runs verification within a group and will emit biblioref.
// |
// | $ skate-reduce -m verify -F a.tsv -L b.tsv
// |
// |
// * ref | takes a single file with clusters containing releases and refs and
// | will emit verification results.
// |
// | $ skate-reduce -m ref < a.ndj
// |
// |
// * bref | same as ref, but generate a biblioref file as output
// |
// | $ skate-reduce -m bref < a.ndj
// |
// |
// * wiki | zippy mode for releases and wikipedia inputs.
// |
// | $ skate-reduce -m wiki -L a.ndj -W b.ndj
//
package main
import (
"bufio"
"flag"
"log"
"os"
"runtime"
"git.archive.org/martin/cgraph/skate"
"git.archive.org/martin/cgraph/skate/parallel"
"git.archive.org/martin/cgraph/skate/xio"
)
var (
	numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
	batchSize  = flag.Int("b", 10000, "batch size")
	mode       = flag.String("m", "ref", "mode, e.g. exact, verify, ref, bref, wiki")

	// Possible inputs -- we could switch to a subcommand cli parser?
	refs     = flag.String("F", "", "path to refs input")
	releases = flag.String("L", "", "path to release input")
	wiki     = flag.String("W", "", "path to wiki input")

	// Extra args.
	reason = flag.String("r", "", "reason for match: doi, pmid, pmcid, arxiv, unknown")

	// reasonMap translates the -r flag value into a match result carrying
	// the corresponding status and reason; used by "exact" and "wiki" modes.
	// Element type elided per gofmt -s (composite literal simplification).
	reasonMap = map[string]skate.MatchResult{
		"doi":     {Status: skate.StatusExact, Reason: skate.ReasonDOI},
		"pmid":    {Status: skate.StatusExact, Reason: skate.ReasonPMID},
		"pmcid":   {Status: skate.StatusExact, Reason: skate.ReasonPMCID},
		"arxiv":   {Status: skate.StatusExact, Reason: skate.ReasonArxiv},
		"unknown": {Status: skate.StatusUnknown, Reason: skate.ReasonUnknown},
	}
)
// main dispatches on the -m flag to one of the supported reduce modes and
// exits via log.Fatal on any setup or processing error.
func main() {
	flag.Parse()
	bw := bufio.NewWriter(os.Stdout)
	// NOTE(review): this defer is skipped on log.Fatal paths (os.Exit),
	// which is acceptable since output is aborted on error anyway.
	defer bw.Flush()
	switch *mode {
	case "exact":
		// Join releases (-L) and refs (-F) grouped by key; emit one
		// biblioref per pairing, tagged with the supplied match reason.
		l, f, err := xio.OpenTwo(*releases, *refs)
		if err != nil {
			log.Fatal(err)
		}
		r, ok := reasonMap[*reason]
		if !ok {
			log.Fatalf("unknown reason: %v", *reason)
		}
		if err := skate.ZippyExact(l, f, r, bw); err != nil {
			log.Fatal(err)
		}
	case "verify":
		// Like exact, but run fuzzy verification within each group
		// before emitting biblioref docs.
		l, f, err := xio.OpenTwo(*releases, *refs)
		if err != nil {
			log.Fatal(err)
		}
		if err := skate.ZippyVerifyRefs(l, f, bw); err != nil {
			log.Fatal(err)
		}
	case "ref":
		// Stream clusters from stdin in parallel; emit verification results.
		pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterVerify)
		pp.NumWorkers = *numWorkers
		pp.BatchSize = *batchSize
		if err := pp.Run(); err != nil {
			log.Fatal(err)
		}
	case "bref":
		// Same pipeline as "ref", but emit biblioref docs instead.
		pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterToBiblioRef)
		pp.NumWorkers = *numWorkers
		pp.BatchSize = *batchSize
		if err := pp.Run(); err != nil {
			log.Fatal(err)
		}
	case "wiki":
		// Zippy mode over releases (-L) and wikipedia (-W) inputs; wiki
		// matches are always exact DOI matches, hence reasonMap["doi"].
		l, w, err := xio.OpenTwo(*releases, *wiki)
		if err != nil {
			log.Fatal(err)
		}
		if err := skate.ZippyExactWiki(l, w, reasonMap["doi"], bw); err != nil {
			log.Fatal(err)
		}
	default:
		// Include the offending value; the original message omitted it
		// (and used Fatalf with no formatting directives, a vet smell).
		log.Fatalf("invalid mode: %v", *mode)
	}
}
|