diff options
-rw-r--r-- | python/refcat/tasks.py | 2 | ||||
-rw-r--r-- | skate/Makefile | 2 | ||||
-rw-r--r-- | skate/README.md | 5 | ||||
-rw-r--r-- | skate/cmd/skate-conv/main.go (renamed from skate/cmd/skate-ref-to-release/main.go) | 60 |
4 files changed, 44 insertions, 25 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py index df2245f..bb2685d 100644 --- a/python/refcat/tasks.py +++ b/python/refcat/tasks.py @@ -965,7 +965,7 @@ class RefsToRelease(Refcat): def run(self): output = shellout(""" zstdcat -T0 {input} | - skate-ref-to-release -w 24 -b 100000 | + skate-conv -f ref -w 24 -b 100000 | zstd -T0 -c > {output} """, input=self.input().path) diff --git a/skate/Makefile b/skate/Makefile index 9bc70c2..255bc28 100644 --- a/skate/Makefile +++ b/skate/Makefile @@ -1,5 +1,5 @@ SHELL := /bin/bash -TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map +TARGETS := skate-conv skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map PKGNAME := skate .PHONY: test diff --git a/skate/README.md b/skate/README.md index 7effb89..d3a361c 100644 --- a/skate/README.md +++ b/skate/README.md @@ -78,9 +78,10 @@ Cubic surface 10.2140/ant.2007.1.393 {"type_of_citation" ... > Takes a refs file and plucks out identifiers from unstructured field. -* skate-ref-to-release +* skate-conv -> Converts a ref document to a release. Part of first run, merging refs and releases. +> Converts a ref (or open library) document to a release. Part of first step, +> merging refs and releases. * skate-to-doi diff --git a/skate/cmd/skate-ref-to-release/main.go b/skate/cmd/skate-conv/main.go index d547e62..647472e 100644 --- a/skate/cmd/skate-ref-to-release/main.go +++ b/skate/cmd/skate-conv/main.go @@ -1,5 +1,9 @@ -// skate-ref-to-release converts a "ref" document to a "release" document. +// skate-conv converts various schemas into releases. This should replace the +// very specific skate-ref-to-release and the like. // +// $ skate-conv -f ref < FILE > FILE +// +// Currently source schemas: "ref", "ol", "rg" package main import ( @@ -10,19 +14,38 @@ import ( "strings" "git.archive.org/martin/cgraph/skate" - "github.com/miku/parallel" - + "git.archive.org/martin/cgraph/skate/parallel" json "github.com/segmentio/encoding/json" ) var ( numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") batchSize = flag.Int("b", 100000, "batch size") - fromFormat = flag.String("f", "ref", "import data shape") + fromFormat = flag.String("f", "ref", "import schema") bytesNewline = []byte("\n") + f func([]byte) ([]byte, error) ) +func main() { + flag.Parse() + switch *fromFormat { + case "ref": + f = refToRelease + case "rg": + f = rgSitemapToRelease + case "ol": + f = openLibraryToRelease + } + pp := parallel.NewProcessor(os.Stdin, os.Stdout, f) + pp.NumWorkers = *numWorkers + pp.BatchSize = *batchSize + if err := pp.Run(); err != nil { + log.Fatal(err) + } +} + +// refToRelease converts a ref document to a release. func refToRelease(p []byte) ([]byte, error) { var ref skate.Ref if err := json.Unmarshal(p, &ref); err != nil { @@ -60,22 +83,17 @@ func rgSitemapToRelease(p []byte) ([]byte, error) { return b, err } -func main() { - flag.Parse() - switch *fromFormat { - case "ref": - pp := parallel.NewProcessor(os.Stdin, os.Stdout, refToRelease) - pp.NumWorkers = *numWorkers - pp.BatchSize = *batchSize - if err := pp.Run(); err != nil { - log.Fatal(err) - } - case "rg": - pp := parallel.NewProcessor(os.Stdin, os.Stdout, rgSitemapToRelease) - pp.NumWorkers = *numWorkers - pp.BatchSize = *batchSize - if err := pp.Run(); err != nil { - log.Fatal(err) - } +func openLibraryToRelease(p []byte) ([]byte, error) { + var w skate.OpenLibraryWork + if err := json.Unmarshal(p, &w); err != nil { + return nil, err } + release, err := skate.OpenLibraryToRelease(&w) + if err != nil { + return nil, err + } + release.Extra.Skate.Status = "ol" + b, err := json.Marshal(release) + b = append(b, bytesNewline...) + return b, err } |