about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- skate/cmd/skate-conv/main.go 53
-rw-r--r-- skate/cmd/skate-from-unstructured/main.go 2
-rw-r--r-- skate/schema.go 5
-rw-r--r-- skate/verify.go 6
4 files changed, 36 insertions, 30 deletions
diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go
index 647472e..34e79a3 100644
--- a/skate/cmd/skate-conv/main.go
+++ b/skate/cmd/skate-conv/main.go
@@ -21,10 +21,9 @@ import (
var (
numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
batchSize = flag.Int("b", 100000, "batch size")
- fromFormat = flag.String("f", "ref", "import schema")
+ fromFormat = flag.String("f", "ref", "import schema: ref, rg, ol")
- bytesNewline = []byte("\n")
- f func([]byte) ([]byte, error)
+ f func([]byte) ([]byte, error) // converter function; assigned in main according to the import schema
)
func main() {
@@ -36,6 +35,8 @@ func main() {
f = rgSitemapToRelease
case "ol":
f = openLibraryToRelease
+ default: // reject unknown schemas instead of continuing with a nil converter f
+ log.Fatalf("unsupported input schema: %v", *fromFormat)
}
pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
pp.NumWorkers = *numWorkers
@@ -45,55 +46,61 @@ func main() {
}
}
-// refToRelease converts a ref document to a release.
+// refToRelease decodes a JSON ref document, converts it to a release tagged
+// with status "ref" plus the ref index and key, and returns it as a JSON line.
func refToRelease(p []byte) ([]byte, error) {
- var ref skate.Ref
- if err := json.Unmarshal(p, &ref); err != nil {
+ var (
+ ref skate.Ref
+ release *skate.Release
+ err error
+ )
+ if err = json.Unmarshal(p, &ref); err != nil {
return nil, err
}
- release, err := skate.RefToRelease(&ref)
- if err != nil {
+ if release, err = skate.RefToRelease(&ref); err != nil {
return nil, err
}
release.Extra.Skate.Status = "ref" // means: converted from ref
release.Extra.Skate.Ref.Index = ref.Index
release.Extra.Skate.Ref.Key = ref.Key
- b, err := json.Marshal(release)
- b = append(b, bytesNewline...)
- return b, err
+ return skate.JsonMarshalNewline(release)
}
+// rgSitemapToRelease turns a JSON sitemap entry (title and URL) into a release JSON line, e.g. from
+// https://archive.org/download/rg_sitemap_2021_02_23/rg_sitemap_2021_02_23.ndj.zst.
func rgSitemapToRelease(p []byte) ([]byte, error) {
var (
s skate.Sitemap
release skate.Release
+ err error
)
- if err := json.Unmarshal(p, &s); err != nil {
+ if err = json.Unmarshal(p, &s); err != nil {
return nil, err
}
release.Title = s.Title
if len(s.URL) > 41 {
- // XXX: A pseudo ident, maybe irritating.
+ // Pseudo ident: skip the 41 byte URL prefix and keep the part before the first "_", e.g. "321885388" for
+ // https://www.researchgate.net/publication/321885388_We_came_here_on_dif
release.Ident = strings.Split(s.URL[41:], "_")[0]
}
release.Extra.Skate.Status = "rg"
release.Extra.Skate.ResearchGate.URL = s.URL
- b, err := json.Marshal(release)
- b = append(b, bytesNewline...)
- return b, err
+ return skate.JsonMarshalNewline(release)
}
+// openLibraryToRelease decodes a JSON Open Library work item and returns it as a release JSON line with status "ol".
func openLibraryToRelease(p []byte) ([]byte, error) {
- var w skate.OpenLibraryWork
- if err := json.Unmarshal(p, &w); err != nil {
+ var (
+ w skate.OpenLibraryWork
+ release *skate.Release
+ err error
+ )
+ if err = json.Unmarshal(p, &w); err != nil {
return nil, err
}
- release, err := skate.OpenLibraryToRelease(&w)
- if err != nil {
+ if release, err = skate.OpenLibraryToRelease(&w); err != nil {
return nil, err
}
release.Extra.Skate.Status = "ol"
- b, err := json.Marshal(release)
- b = append(b, bytesNewline...)
- return b, err
+ return skate.JsonMarshalNewline(release)
}
diff --git a/skate/cmd/skate-from-unstructured/main.go b/skate/cmd/skate-from-unstructured/main.go
index 179057d..2ccdd7d 100644
--- a/skate/cmd/skate-from-unstructured/main.go
+++ b/skate/cmd/skate-from-unstructured/main.go
@@ -28,7 +28,7 @@ func main() {
if err := skate.ParseUnstructured(&ref); err != nil {
return nil, err
}
- return skate.JsonMarshalLine(&ref)
+ return skate.JsonMarshalNewline(&ref)
})
pp.NumWorkers = *numWorkers
pp.BatchSize = *batchSize
diff --git a/skate/schema.go b/skate/schema.go
index 9f3af45..1878205 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -27,8 +27,7 @@ var (
}
)
-// RefToRelease converts a ref to a release. Set a extra.skate.status flag to
-// be able to distinguish converted entities later.
+// RefToRelease converts a ref to a release.
func RefToRelease(ref *Ref) (*Release, error) {
var (
release Release
@@ -201,7 +200,7 @@ type Release struct {
Relations []DataCiteRelation `json:"relations,omitempty"`
} `json:"datacite,omitempty"`
Skate struct {
- // Mark as converted (e.g. by setting status to "ref")
+ // Mark as converted from "ref", "rg" or other schemas.
Status string `json:"status,omitempty"`
// Carry the ref index and key around.
Ref struct {
diff --git a/skate/verify.go b/skate/verify.go
index e6ab03e..d1f98f0 100644
--- a/skate/verify.go
+++ b/skate/verify.go
@@ -124,8 +124,8 @@ var (
PatPages = regexp.MustCompile(`([0-9]{1,})-([0-9]{1,})`)
)
-// JsonMarshalLine marshals a value as JSON and adds a newline.
-func JsonMarshalLine(v interface{}) ([]byte, error) {
+// JsonMarshalNewline marshals a value as JSON and adds a newline.
+func JsonMarshalNewline(v interface{}) ([]byte, error) {
b, err := json.Marshal(v)
if err != nil {
return nil, err
@@ -234,7 +234,7 @@ func RefClusterToBiblioRef(p []byte) ([]byte, error) {
continue // Assume we already have the DOI matches.
}
br = generateBiblioRef(re, pivot, result, "fuzzy")
- return JsonMarshalLine(br)
+ return JsonMarshalNewline(br)
default:
continue
}