From 77ca4cd924993188e0e9f8dd072af9f173eaad91 Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Fri, 30 Apr 2021 22:43:07 +0200 Subject: rename skate-ref-to-release to skate-conv --- python/refcat/tasks.py | 2 +- skate/Makefile | 2 +- skate/README.md | 5 +- skate/cmd/skate-conv/main.go | 99 ++++++++++++++++++++++++++++++++++ skate/cmd/skate-ref-to-release/main.go | 81 ---------------------------- 5 files changed, 104 insertions(+), 85 deletions(-) create mode 100644 skate/cmd/skate-conv/main.go delete mode 100644 skate/cmd/skate-ref-to-release/main.go diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index df2245f..bb2685d 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -965,7 +965,7 @@ class RefsToRelease(Refcat): def run(self): output = shellout(""" zstdcat -T0 {input} | - skate-ref-to-release -w 24 -b 100000 | + skate-conv -f ref -w 24 -b 100000 | zstd -T0 -c > {output} """, input=self.input().path) diff --git a/skate/Makefile b/skate/Makefile index 9bc70c2..255bc28 100644 --- a/skate/Makefile +++ b/skate/Makefile @@ -1,5 +1,5 @@ SHELL := /bin/bash -TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map +TARGETS := skate-conv skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map PKGNAME := skate .PHONY: test diff --git a/skate/README.md b/skate/README.md index 7effb89..d3a361c 100644 --- a/skate/README.md +++ b/skate/README.md @@ -78,9 +78,10 @@ Cubic surface 10.2140/ant.2007.1.393 {"type_of_citation" ... > Takes a refs file and plucks out identifiers from unstructured field. -* skate-ref-to-release +* skate-conv -> Converts a ref document to a release. Part of first run, merging refs and releases. +> Converts a ref (or open library) document to a release. Part of first step, +> merging refs and releases. * skate-to-doi diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go new file mode 100644 index 0000000..647472e --- /dev/null +++ b/skate/cmd/skate-conv/main.go @@ -0,0 +1,99 @@ +// skate-conv converts various schemas into releases. This should replace the +// very specific skate-ref-to-release and the like. +// +// $ skate-conv -f ref < FILE > FILE +// +// Currently source schemas: "ref", "ol", "rg" +package main + +import ( + "flag" + "log" + "os" + "runtime" + "strings" + + "git.archive.org/martin/cgraph/skate" + "git.archive.org/martin/cgraph/skate/parallel" + json "github.com/segmentio/encoding/json" +) + +var ( + numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") + batchSize = flag.Int("b", 100000, "batch size") + fromFormat = flag.String("f", "ref", "import schema") + + bytesNewline = []byte("\n") + f func([]byte) ([]byte, error) +) + +func main() { + flag.Parse() + switch *fromFormat { + case "ref": + f = refToRelease + case "rg": + f = rgSitemapToRelease + case "ol": + f = openLibraryToRelease + } + pp := parallel.NewProcessor(os.Stdin, os.Stdout, f) + pp.NumWorkers = *numWorkers + pp.BatchSize = *batchSize + if err := pp.Run(); err != nil { + log.Fatal(err) + } +} + +// refToRelease converts a ref document to a release. +func refToRelease(p []byte) ([]byte, error) { + var ref skate.Ref + if err := json.Unmarshal(p, &ref); err != nil { + return nil, err + } + release, err := skate.RefToRelease(&ref) + if err != nil { + return nil, err + } + release.Extra.Skate.Status = "ref" // means: converted from ref + release.Extra.Skate.Ref.Index = ref.Index + release.Extra.Skate.Ref.Key = ref.Key + b, err := json.Marshal(release) + b = append(b, bytesNewline...) + return b, err +} + +func rgSitemapToRelease(p []byte) ([]byte, error) { + var ( + s skate.Sitemap + release skate.Release + ) + if err := json.Unmarshal(p, &s); err != nil { + return nil, err + } + release.Title = s.Title + if len(s.URL) > 41 { + // XXX: A pseudo ident, maybe irritating. + release.Ident = strings.Split(s.URL[41:], "_")[0] + } + release.Extra.Skate.Status = "rg" + release.Extra.Skate.ResearchGate.URL = s.URL + b, err := json.Marshal(release) + b = append(b, bytesNewline...) + return b, err +} + +func openLibraryToRelease(p []byte) ([]byte, error) { + var w skate.OpenLibraryWork + if err := json.Unmarshal(p, &w); err != nil { + return nil, err + } + release, err := skate.OpenLibraryToRelease(&w) + if err != nil { + return nil, err + } + release.Extra.Skate.Status = "ol" + b, err := json.Marshal(release) + b = append(b, bytesNewline...) + return b, err +} diff --git a/skate/cmd/skate-ref-to-release/main.go b/skate/cmd/skate-ref-to-release/main.go deleted file mode 100644 index d547e62..0000000 --- a/skate/cmd/skate-ref-to-release/main.go +++ /dev/null @@ -1,81 +0,0 @@ -// skate-ref-to-release converts a "ref" document to a "release" document. -// -package main - -import ( - "flag" - "log" - "os" - "runtime" - "strings" - - "git.archive.org/martin/cgraph/skate" - "github.com/miku/parallel" - - json "github.com/segmentio/encoding/json" -) - -var ( - numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") - batchSize = flag.Int("b", 100000, "batch size") - fromFormat = flag.String("f", "ref", "import data shape") - - bytesNewline = []byte("\n") -) - -func refToRelease(p []byte) ([]byte, error) { - var ref skate.Ref - if err := json.Unmarshal(p, &ref); err != nil { - return nil, err - } - release, err := skate.RefToRelease(&ref) - if err != nil { - return nil, err - } - release.Extra.Skate.Status = "ref" // means: converted from ref - release.Extra.Skate.Ref.Index = ref.Index - release.Extra.Skate.Ref.Key = ref.Key - b, err := json.Marshal(release) - b = append(b, bytesNewline...) - return b, err -} - -func rgSitemapToRelease(p []byte) ([]byte, error) { - var ( - s skate.Sitemap - release skate.Release - ) - if err := json.Unmarshal(p, &s); err != nil { - return nil, err - } - release.Title = s.Title - if len(s.URL) > 41 { - // XXX: A pseudo ident, maybe irritating. - release.Ident = strings.Split(s.URL[41:], "_")[0] - } - release.Extra.Skate.Status = "rg" - release.Extra.Skate.ResearchGate.URL = s.URL - b, err := json.Marshal(release) - b = append(b, bytesNewline...) - return b, err -} - -func main() { - flag.Parse() - switch *fromFormat { - case "ref": - pp := parallel.NewProcessor(os.Stdin, os.Stdout, refToRelease) - pp.NumWorkers = *numWorkers - pp.BatchSize = *batchSize - if err := pp.Run(); err != nil { - log.Fatal(err) - } - case "rg": - pp := parallel.NewProcessor(os.Stdin, os.Stdout, rgSitemapToRelease) - pp.NumWorkers = *numWorkers - pp.BatchSize = *batchSize - if err := pp.Run(); err != nil { - log.Fatal(err) - } - } -} -- cgit v1.2.3