about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- python/refcat/tasks.py | 2
-rw-r--r-- skate/Makefile | 2
-rw-r--r-- skate/README.md | 5
-rw-r--r-- skate/cmd/skate-conv/main.go (renamed from skate/cmd/skate-ref-to-release/main.go) | 60
4 files changed, 44 insertions, 25 deletions
diff --git a/python/refcat/tasks.py b/python/refcat/tasks.py
index df2245f..bb2685d 100644
--- a/python/refcat/tasks.py
+++ b/python/refcat/tasks.py
@@ -965,7 +965,7 @@ class RefsToRelease(Refcat):
def run(self):
output = shellout("""
zstdcat -T0 {input} |
- skate-ref-to-release -w 24 -b 100000 |
+ skate-conv -f ref -w 24 -b 100000 |
zstd -T0 -c > {output}
""",
input=self.input().path)
diff --git a/skate/Makefile b/skate/Makefile
index 9bc70c2..255bc28 100644
--- a/skate/Makefile
+++ b/skate/Makefile
@@ -1,5 +1,5 @@
SHELL := /bin/bash
-TARGETS := skate-ref-to-release skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map
+TARGETS := skate-conv skate-derive-key skate-cluster skate-verify skate-to-doi skate-bref-id skate-from-unstructured skate-wikipedia-doi skate-dot skate-map
PKGNAME := skate
.PHONY: test
diff --git a/skate/README.md b/skate/README.md
index 7effb89..d3a361c 100644
--- a/skate/README.md
+++ b/skate/README.md
@@ -78,9 +78,10 @@ Cubic surface 10.2140/ant.2007.1.393 {"type_of_citation" ...
> Takes a refs file and plucks out identifiers from unstructured field.
-* skate-ref-to-release
+* skate-conv
-> Converts a ref document to a release. Part of first run, merging refs and releases.
+> Converts a ref, Open Library (`-f ol`) or rg sitemap (`-f rg`) document to a
+> release. Part of the first step, merging refs and releases.
* skate-to-doi
diff --git a/skate/cmd/skate-ref-to-release/main.go b/skate/cmd/skate-conv/main.go
index d547e62..647472e 100644
--- a/skate/cmd/skate-ref-to-release/main.go
+++ b/skate/cmd/skate-conv/main.go
@@ -1,5 +1,9 @@
-// skate-ref-to-release converts a "ref" document to a "release" document.
+// skate-conv converts various schemas into releases. This should replace the
+// very specific skate-ref-to-release and the like.
//
+// $ skate-conv -f ref < INPUT > OUTPUT
+//
+// Currently supported source schemas: "ref", "ol", "rg".
package main
import (
@@ -10,19 +14,38 @@ import (
"strings"
"git.archive.org/martin/cgraph/skate"
- "github.com/miku/parallel"
-
+ "git.archive.org/martin/cgraph/skate/parallel"
json "github.com/segmentio/encoding/json"
)
var (
numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
batchSize = flag.Int("b", 100000, "batch size")
- fromFormat = flag.String("f", "ref", "import data shape")
+ fromFormat = flag.String("f", "ref", "import schema")
bytesNewline = []byte("\n")
+ f func([]byte) ([]byte, error)
)
+func main() {
+ flag.Parse()
+ switch *fromFormat {
+ case "ref":
+ f = refToRelease
+ case "rg":
+ f = rgSitemapToRelease
+ case "ol":
+ f = openLibraryToRelease
+ }
+ pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
+ pp.NumWorkers = *numWorkers
+ pp.BatchSize = *batchSize
+ if err := pp.Run(); err != nil {
+ log.Fatal(err)
+ }
+}
+
+// refToRelease converts a ref document to a release.
func refToRelease(p []byte) ([]byte, error) {
var ref skate.Ref
if err := json.Unmarshal(p, &ref); err != nil {
@@ -60,22 +83,17 @@ func rgSitemapToRelease(p []byte) ([]byte, error) {
return b, err
}
-func main() {
- flag.Parse()
- switch *fromFormat {
- case "ref":
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, refToRelease)
- pp.NumWorkers = *numWorkers
- pp.BatchSize = *batchSize
- if err := pp.Run(); err != nil {
- log.Fatal(err)
- }
- case "rg":
- pp := parallel.NewProcessor(os.Stdin, os.Stdout, rgSitemapToRelease)
- pp.NumWorkers = *numWorkers
- pp.BatchSize = *batchSize
- if err := pp.Run(); err != nil {
- log.Fatal(err)
- }
+func openLibraryToRelease(p []byte) ([]byte, error) {
+ var w skate.OpenLibraryWork
+ if err := json.Unmarshal(p, &w); err != nil {
+ return nil, err
}
+ release, err := skate.OpenLibraryToRelease(&w)
+ if err != nil {
+ return nil, err
+ }
+ release.Extra.Skate.Status = "ol"
+ b, err := json.Marshal(release)
+ b = append(b, bytesNewline...)
+ return b, err
}