From 77ca4cd924993188e0e9f8dd072af9f173eaad91 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 30 Apr 2021 22:43:07 +0200 Subject: rename skate-ref-to-release to skate-conv --- skate/cmd/skate-conv/main.go | 99 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 skate/cmd/skate-conv/main.go (limited to 'skate/cmd/skate-conv/main.go') diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go new file mode 100644 index 0000000..647472e --- /dev/null +++ b/skate/cmd/skate-conv/main.go @@ -0,0 +1,99 @@ +// skate-conv converts various schemas into releases. This should replace the +// very specific skate-ref-to-release and the like. +// +// $ skate-conv -f ref < FILE > FILE +// +// Currently source schemas: "ref", "ol", "rg" +package main + +import ( + "flag" + "log" + "os" + "runtime" + "strings" + + "git.archive.org/martin/cgraph/skate" + "git.archive.org/martin/cgraph/skate/parallel" + json "github.com/segmentio/encoding/json" +) + +var ( + numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") + batchSize = flag.Int("b", 100000, "batch size") + fromFormat = flag.String("f", "ref", "import schema") + + bytesNewline = []byte("\n") + f func([]byte) ([]byte, error) +) + +func main() { + flag.Parse() + switch *fromFormat { + case "ref": + f = refToRelease + case "rg": + f = rgSitemapToRelease + case "ol": + f = openLibraryToRelease + } + pp := parallel.NewProcessor(os.Stdin, os.Stdout, f) + pp.NumWorkers = *numWorkers + pp.BatchSize = *batchSize + if err := pp.Run(); err != nil { + log.Fatal(err) + } +} + +// refToRelease converts a ref document to a release. +func refToRelease(p []byte) ([]byte, error) { + var ref skate.Ref + if err := json.Unmarshal(p, &ref); err != nil { + return nil, err + } + release, err := skate.RefToRelease(&ref) + if err != nil { + return nil, err + } + release.Extra.Skate.Status = "ref" // means: converted from ref + release.Extra.Skate.Ref.Index = ref.Index + release.Extra.Skate.Ref.Key = ref.Key + b, err := json.Marshal(release) + b = append(b, bytesNewline...) + return b, err +} + +func rgSitemapToRelease(p []byte) ([]byte, error) { + var ( + s skate.Sitemap + release skate.Release + ) + if err := json.Unmarshal(p, &s); err != nil { + return nil, err + } + release.Title = s.Title + if len(s.URL) > 41 { + // XXX: A pseudo ident, maybe irritating. + release.Ident = strings.Split(s.URL[41:], "_")[0] + } + release.Extra.Skate.Status = "rg" + release.Extra.Skate.ResearchGate.URL = s.URL + b, err := json.Marshal(release) + b = append(b, bytesNewline...) + return b, err +} + +func openLibraryToRelease(p []byte) ([]byte, error) { + var w skate.OpenLibraryWork + if err := json.Unmarshal(p, &w); err != nil { + return nil, err + } + release, err := skate.OpenLibraryToRelease(&w) + if err != nil { + return nil, err + } + release.Extra.Skate.Status = "ol" + b, err := json.Marshal(release) + b = append(b, bytesNewline...) + return b, err +} -- cgit v1.2.3