aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cmd
diff options
context:
space:
mode:
Diffstat (limited to 'skate/cmd')
-rw-r--r-- skate/cmd/skate-conv/main.go | 53
-rw-r--r-- skate/cmd/skate-from-unstructured/main.go | 2
2 files changed, 31 insertions, 24 deletions
diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go
index 647472e..34e79a3 100644
--- a/skate/cmd/skate-conv/main.go
+++ b/skate/cmd/skate-conv/main.go
@@ -21,10 +21,9 @@ import (
var (
numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
batchSize = flag.Int("b", 100000, "batch size")
- fromFormat = flag.String("f", "ref", "import schema")
+ fromFormat = flag.String("f", "ref", "import schema: ref, rg, ol")
- bytesNewline = []byte("\n")
- f func([]byte) ([]byte, error)
+ f func([]byte) ([]byte, error) // our converter function
)
func main() {
@@ -36,6 +35,8 @@ func main() {
f = rgSitemapToRelease
case "ol":
f = openLibraryToRelease
+ default:
+ log.Fatalf("unsupported input schema: %v", *fromFormat)
}
pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
pp.NumWorkers = *numWorkers
@@ -45,55 +46,61 @@ func main() {
}
}
-// refToRelease converts a ref document to a release.
+// refToRelease converts a JSON encoded ref document to a release: the standard
+// skate.RefToRelease conversion, plus status, ref index and ref key as extras.
func refToRelease(p []byte) ([]byte, error) {
- var ref skate.Ref
- if err := json.Unmarshal(p, &ref); err != nil {
+ var (
+ ref skate.Ref
+ release *skate.Release
+ err error
+ )
+ if err = json.Unmarshal(p, &ref); err != nil {
return nil, err
}
- release, err := skate.RefToRelease(&ref)
- if err != nil {
+ if release, err = skate.RefToRelease(&ref); err != nil {
return nil, err
}
release.Extra.Skate.Status = "ref" // means: converted from ref
release.Extra.Skate.Ref.Index = ref.Index
release.Extra.Skate.Ref.Key = ref.Key
- b, err := json.Marshal(release)
- b = append(b, bytesNewline...)
- return b, err
+ return skate.JsonMarshalNewline(release)
}
+// rgSitemapToRelease converts a simple sitemap to a release entity, e.g. from
+// https://archive.org/download/rg_sitemap_2021_02_23/rg_sitemap_2021_02_23.ndj.zst.
func rgSitemapToRelease(p []byte) ([]byte, error) {
var (
s skate.Sitemap
release skate.Release
+ err error
)
- if err := json.Unmarshal(p, &s); err != nil {
+ if err = json.Unmarshal(p, &s); err != nil {
return nil, err
}
release.Title = s.Title
if len(s.URL) > 41 {
- // XXX: A pseudo ident, maybe irritating.
+ // A pseudo ident, maybe irritating; we want the "321885388" found after the
+ // first 41 bytes of https://www.researchgate.net/publication/321885388_We_came_here_on_dif
release.Ident = strings.Split(s.URL[41:], "_")[0]
}
release.Extra.Skate.Status = "rg"
release.Extra.Skate.ResearchGate.URL = s.URL
- b, err := json.Marshal(release)
- b = append(b, bytesNewline...)
- return b, err
+ return skate.JsonMarshalNewline(release)
}
+// openLibraryToRelease converts a JSON encoded Open Library work item to a release with status "ol".
func openLibraryToRelease(p []byte) ([]byte, error) {
- var w skate.OpenLibraryWork
- if err := json.Unmarshal(p, &w); err != nil {
+ var (
+ w skate.OpenLibraryWork
+ release *skate.Release
+ err error
+ )
+ if err = json.Unmarshal(p, &w); err != nil {
return nil, err
}
- release, err := skate.OpenLibraryToRelease(&w)
- if err != nil {
+ if release, err = skate.OpenLibraryToRelease(&w); err != nil {
return nil, err
}
release.Extra.Skate.Status = "ol"
- b, err := json.Marshal(release)
- b = append(b, bytesNewline...)
- return b, err
+ return skate.JsonMarshalNewline(release)
}
diff --git a/skate/cmd/skate-from-unstructured/main.go b/skate/cmd/skate-from-unstructured/main.go
index 179057d..2ccdd7d 100644
--- a/skate/cmd/skate-from-unstructured/main.go
+++ b/skate/cmd/skate-from-unstructured/main.go
@@ -28,7 +28,7 @@ func main() {
if err := skate.ParseUnstructured(&ref); err != nil {
return nil, err
}
- return skate.JsonMarshalLine(&ref)
+ return skate.JsonMarshalNewline(&ref)
})
pp.NumWorkers = *numWorkers
pp.BatchSize = *batchSize