// skate-reduce takes prepared inputs (e.g. from skate-map or skate-cluster)
// and applies various verification and conversion functions. The output will
// often be a stream of biblioref schema docs.
//
// Supports various "modes", e.g. exact, fuzzy, ref, bref, wiki. Each mode may
// work on one or two files, and may need extra args.
//
// * exact     | takes two (key, doc) TSV files (one for releases, one for refs) and
//             | will emit biblioref docs relating *one* element from releases with *all*
//             | elements from ref; this is for "doi", "pmid" and other id matches, where no
//             | further checks are necessary. The match reason, e.g. "doi" needs to be
//             | supplied.
//             |
//             | $ skate-reduce -m exact -r doi -F a.tsv -L b.tsv
//             |
//             |
// * fuzzy     | takes two (key, doc) TSV files (one for release, one for refs),
//             | runs verification within a group and will emit biblioref.
//             |
//             | $ skate-reduce -m fuzzy -F a.tsv -L b.tsv
//             |
//             |
// * ref       | takes a single file with clusters containing releases and refs and
//             | will emit verification results (deprecated).
//             |
//             | $ skate-reduce -m ref < a.ndj
//             |
//             |
// * bref      | same as ref, but generate a biblioref file as output (deprecated).
//             |
//             | $ skate-reduce -m bref < a.ndj
//             |
//             |
// * wiki      | zippy mode for releases and wikipedia inputs.
//             |
//             | $ skate-reduce -m wiki -L a.ndj -W b.ndj
//             |
//             |
// * oledt     | zippy mode for releases and OL inputs, dumps table for debugging.
//             |
//             | $ skate-reduce -m oledt -F a.ndj -O b.ndj
//             |
//             |
// * oled      | zippy mode for releases and OL inputs, emit biblioref.
//             |
//             | $ skate-reduce -m oled -F a.ndj -O b.ndj
//             |
//             |
// * rere      | zippy mode for OL release and ref (as release) inputs, emit biblioref.
// | // | $ skate-reduce -m rere -O a.ndj -F b.ndj // | // | // * unmatched | join matched and unmatched reference data; do deduplicate on the fly // | // | $ skate-reduce -m unmatched -B a.ndj -F b.ndj // | // | // * wb | join raw refs with urls with ad-hoc wayback json schema noting last timestamp // | // | $ skate-reduce -m wb -F a.ndj -C b.ndj // package main import ( "bufio" "flag" "fmt" "io/ioutil" "log" "os" "runtime" "git.archive.org/martin/cgraph/skate" "git.archive.org/martin/cgraph/skate/parallel" "git.archive.org/martin/cgraph/skate/xio" gzip "github.com/klauspost/compress/gzip" ) var ( numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") batchSize = flag.Int("b", 10000, "batch size") mode = flag.String("m", "ref", "mode, e.g. exact, fuzzy, ref, bref, wiki, oled, oledt, unmatched") quite = flag.Bool("q", false, "be quiet") logFile = flag.String("log", "", "log filename (stderr, if empty; logfile will be gzip compressed)") // Possible inputs -- we could switch to a subcommand cli parser? bref = flag.String("B", "", "path to bref file") refs = flag.String("F", "", "path to refs input") releases = flag.String("L", "", "path to release input") wiki = flag.String("W", "", "path to wiki input") openLibrary = flag.String("O", "", "path to open library input") // XXX: Too generic name. cdxAdhoc = flag.String("C", "", "path to cdx adhoc schema") // Extra args. 
reason = flag.String("r", "", "reason for match: doi, pmid, pmcid, arxiv, unknown") reasonMap = map[string]skate.MatchResult{ "doi": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonDOI}, "pmid": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonPMID}, "pmcid": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonPMCID}, "arxiv": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonArxiv}, "unknown": skate.MatchResult{Status: skate.StatusUnknown, Reason: skate.ReasonUnknown}, "isbn": skate.MatchResult{Status: skate.StatusExact, Reason: skate.ReasonISBN}, } help = `Reduce modes TODO: table of modes and inputs, or specific link to docs ` ) func main() { flag.Usage = func() { fmt.Fprintf(flag.CommandLine.Output(), "Usage of %s:\n", os.Args[0]) flag.PrintDefaults() fmt.Println() fmt.Println(help) } flag.Parse() if *logFile != "" { f, err := os.OpenFile(*logFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644) if err != nil { log.Fatal(err) } gzw := gzip.NewWriter(f) defer gzw.Flush() defer f.Close() log.SetOutput(gzw) } if *quite { log.SetOutput(ioutil.Discard) } bw := bufio.NewWriter(os.Stdout) defer bw.Flush() switch *mode { case "exact": l, f, err := xio.OpenTwo(*releases, *refs) if err != nil { log.Fatal(err) } r, ok := reasonMap[*reason] if !ok { log.Fatalf("unknown reason: %v", *reason) } if err := skate.ZippyExact(l, f, r, bw); err != nil { log.Fatal(err) } case "fuzzy": l, f, err := xio.OpenTwo(*releases, *refs) if err != nil { log.Fatal(err) } if err := skate.ZippyVerifyRefs(l, f, bw); err != nil { log.Fatal(err) } case "ref": pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterVerify) pp.NumWorkers = *numWorkers pp.BatchSize = *batchSize if err := pp.Run(); err != nil { log.Fatal(err) } case "bref": pp := parallel.NewProcessor(os.Stdin, os.Stdout, skate.RefClusterToBiblioRef) pp.NumWorkers = *numWorkers pp.BatchSize = *batchSize if err := pp.Run(); err != nil { log.Fatal(err) } case "wiki": l, w, 
err := xio.OpenTwo(*releases, *wiki) if err != nil { log.Fatal(err) } if err := skate.ZippyExactWiki(l, w, reasonMap["doi"], bw); err != nil { log.Fatal(err) } case "oledt": o, f, err := xio.OpenTwo(*openLibrary, *refs) if err != nil { log.Fatal(err) } if err := skate.ZippyVerifyRefsOpenLibraryTable(o, f, bw); err != nil { log.Fatal(err) } case "oled": o, f, err := xio.OpenTwo(*openLibrary, *refs) if err != nil { log.Fatal(err) } if err := skate.ZippyVerifyRefsOpenLibrary(o, f, bw); err != nil { log.Fatal(err) } case "rere": o, f, err := xio.OpenTwo(*releases, *refs) if err != nil { log.Fatal(err) } r, ok := reasonMap[*reason] if !ok { log.Fatalf("unknown reason: %v", *reason) } if err := skate.ZippyExactReleases(o, f, r, bw); err != nil { log.Fatal(err) } case "unmatched": b, f, err := xio.OpenTwo(*bref, *refs) if err != nil { log.Fatal(err) } if err := skate.ZippyBrefAugment(b, f, bw); err != nil { log.Fatal(err) } case "wb": f, c, err := xio.OpenTwo(*refs, *cdxAdhoc) if err != nil { log.Fatal(err) } if err := skate.ZippyWayback(f, c, bw); err != nil { log.Fatal(err) } default: log.Fatalf("invalid mode") } }