From 048641f7672e2ff04a80d4486d0c21bd61369f0f Mon Sep 17 00:00:00 2001
From: Martin Czygan
Date: Wed, 5 May 2021 16:12:55 +0200
Subject: skate-conv tweaks

---
Note: the new default branch in main() uses log.Fatalf so that the %v verb
is actually formatted; log.Fatal would print the format string verbatim.

 skate/cmd/skate-conv/main.go              | 53 +++++++++++++++++--------------
 skate/cmd/skate-from-unstructured/main.go |  2 +-
 2 files changed, 31 insertions(+), 24 deletions(-)

(limited to 'skate/cmd')

diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go
index 647472e..34e79a3 100644
--- a/skate/cmd/skate-conv/main.go
+++ b/skate/cmd/skate-conv/main.go
@@ -21,10 +21,9 @@ import (
 var (
 	numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
 	batchSize  = flag.Int("b", 100000, "batch size")
-	fromFormat = flag.String("f", "ref", "import schema")
+	fromFormat = flag.String("f", "ref", "import schema: ref, rg, ol")
 
-	bytesNewline = []byte("\n")
-	f            func([]byte) ([]byte, error)
+	f func([]byte) ([]byte, error) // our converter function
 )
 
 func main() {
@@ -36,6 +35,8 @@ func main() {
 		f = rgSitemapToRelease
 	case "ol":
 		f = openLibraryToRelease
+	default:
+		log.Fatalf("unsupported input schema: %v", *fromFormat)
 	}
 	pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
 	pp.NumWorkers = *numWorkers
@@ -45,55 +46,61 @@ func main() {
 	}
 }
 
-// refToRelease converts a ref document to a release.
+// refToRelease converts a ref document to a release. The standard conversion
+// plus some extra fields.
 func refToRelease(p []byte) ([]byte, error) {
-	var ref skate.Ref
-	if err := json.Unmarshal(p, &ref); err != nil {
+	var (
+		ref     skate.Ref
+		release *skate.Release
+		err     error
+	)
+	if err = json.Unmarshal(p, &ref); err != nil {
 		return nil, err
 	}
-	release, err := skate.RefToRelease(&ref)
-	if err != nil {
+	if release, err = skate.RefToRelease(&ref); err != nil {
 		return nil, err
 	}
 	release.Extra.Skate.Status = "ref" // means: converted from ref
 	release.Extra.Skate.Ref.Index = ref.Index
 	release.Extra.Skate.Ref.Key = ref.Key
-	b, err := json.Marshal(release)
-	b = append(b, bytesNewline...)
-	return b, err
+	return skate.JsonMarshalNewline(release)
 }
 
+// rgSitemapToRelease converts a simple sitemap to a release entity, e.g. from
+// https://archive.org/download/rg_sitemap_2021_02_23/rg_sitemap_2021_02_23.ndj.zst.
 func rgSitemapToRelease(p []byte) ([]byte, error) {
 	var (
 		s       skate.Sitemap
 		release skate.Release
+		err     error
 	)
-	if err := json.Unmarshal(p, &s); err != nil {
+	if err = json.Unmarshal(p, &s); err != nil {
 		return nil, err
 	}
 	release.Title = s.Title
 	if len(s.URL) > 41 {
-		// XXX: A pseudo ident, maybe irritating.
+		// A pseudo ident, maybe irritating; we want the "321885388".
+		// https://www.researchgate.net/publication/321885388_We_came_here_on_dif
 		release.Ident = strings.Split(s.URL[41:], "_")[0]
 	}
 	release.Extra.Skate.Status = "rg"
 	release.Extra.Skate.ResearchGate.URL = s.URL
-	b, err := json.Marshal(release)
-	b = append(b, bytesNewline...)
-	return b, err
+	return skate.JsonMarshalNewline(release)
 }
 
+// openLibraryToRelease converts an Open Library work item to a release.
 func openLibraryToRelease(p []byte) ([]byte, error) {
-	var w skate.OpenLibraryWork
-	if err := json.Unmarshal(p, &w); err != nil {
+	var (
+		w       skate.OpenLibraryWork
+		release *skate.Release
+		err     error
+	)
+	if err = json.Unmarshal(p, &w); err != nil {
 		return nil, err
 	}
-	release, err := skate.OpenLibraryToRelease(&w)
-	if err != nil {
+	if release, err = skate.OpenLibraryToRelease(&w); err != nil {
 		return nil, err
 	}
 	release.Extra.Skate.Status = "ol"
-	b, err := json.Marshal(release)
-	b = append(b, bytesNewline...)
-	return b, err
+	return skate.JsonMarshalNewline(release)
 }
diff --git a/skate/cmd/skate-from-unstructured/main.go b/skate/cmd/skate-from-unstructured/main.go
index 179057d..2ccdd7d 100644
--- a/skate/cmd/skate-from-unstructured/main.go
+++ b/skate/cmd/skate-from-unstructured/main.go
@@ -28,7 +28,7 @@ func main() {
 		if err := skate.ParseUnstructured(&ref); err != nil {
 			return nil, err
 		}
-		return skate.JsonMarshalLine(&ref)
+		return skate.JsonMarshalNewline(&ref)
 	})
 	pp.NumWorkers = *numWorkers
 	pp.BatchSize = *batchSize
-- 
cgit v1.2.3