diff options
Diffstat (limited to 'skate/cmd')
| -rw-r--r-- | skate/cmd/skate-conv/main.go | 53 | ||||
| -rw-r--r-- | skate/cmd/skate-from-unstructured/main.go | 2 | 
2 files changed, 31 insertions, 24 deletions
diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go index 647472e..34e79a3 100644 --- a/skate/cmd/skate-conv/main.go +++ b/skate/cmd/skate-conv/main.go @@ -21,10 +21,9 @@ import (  var (  	numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")  	batchSize  = flag.Int("b", 100000, "batch size") -	fromFormat = flag.String("f", "ref", "import schema") +	fromFormat = flag.String("f", "ref", "import schema: ref, rg, ol") -	bytesNewline = []byte("\n") -	f            func([]byte) ([]byte, error) +	f func([]byte) ([]byte, error) // our converter function  )  func main() { @@ -36,6 +35,8 @@ func main() {  		f = rgSitemapToRelease  	case "ol":  		f = openLibraryToRelease +	default: +		log.Fatal("unsupported input schema: %v", *fromFormat)  	}  	pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)  	pp.NumWorkers = *numWorkers @@ -45,55 +46,61 @@ func main() {  	}  } -// refToRelease converts a ref document to a release. +// refToRelease converts a ref document to a release. The standard conversion +// plus some extra fields.  func refToRelease(p []byte) ([]byte, error) { -	var ref skate.Ref -	if err := json.Unmarshal(p, &ref); err != nil { +	var ( +		ref     skate.Ref +		release *skate.Release +		err     error +	) +	if err = json.Unmarshal(p, &ref); err != nil {  		return nil, err  	} -	release, err := skate.RefToRelease(&ref) -	if err != nil { +	if release, err = skate.RefToRelease(&ref); err != nil {  		return nil, err  	}  	release.Extra.Skate.Status = "ref" // means: converted from ref  	release.Extra.Skate.Ref.Index = ref.Index  	release.Extra.Skate.Ref.Key = ref.Key -	b, err := json.Marshal(release) -	b = append(b, bytesNewline...) -	return b, err +	return skate.JsonMarshalNewline(release)  } +// rgSitemapToRelease converts a simple sitemap to a release entity, e.g. from +// https://archive.org/download/rg_sitemap_2021_02_23/rg_sitemap_2021_02_23.ndj.zst.  func rgSitemapToRelease(p []byte) ([]byte, error) {  	var (  		s       skate.Sitemap  		release skate.Release +		err     error  	) -	if err := json.Unmarshal(p, &s); err != nil { +	if err = json.Unmarshal(p, &s); err != nil {  		return nil, err  	}  	release.Title = s.Title  	if len(s.URL) > 41 { -		// XXX: A pseudo ident, maybe irritating. +		// A pseudo ident, maybe irritating; we want the "321885388". +		// https://www.researchgate.net/publication/321885388_We_came_here_on_dif  		release.Ident = strings.Split(s.URL[41:], "_")[0]  	}  	release.Extra.Skate.Status = "rg"  	release.Extra.Skate.ResearchGate.URL = s.URL -	b, err := json.Marshal(release) -	b = append(b, bytesNewline...) -	return b, err +	return skate.JsonMarshalNewline(release)  } +// openLibraryToRelease converts an Open Library work item to a release.  func openLibraryToRelease(p []byte) ([]byte, error) { -	var w skate.OpenLibraryWork -	if err := json.Unmarshal(p, &w); err != nil { +	var ( +		w       skate.OpenLibraryWork +		release *skate.Release +		err     error +	) +	if err = json.Unmarshal(p, &w); err != nil {  		return nil, err  	} -	release, err := skate.OpenLibraryToRelease(&w) -	if err != nil { +	if release, err = skate.OpenLibraryToRelease(&w); err != nil {  		return nil, err  	}  	release.Extra.Skate.Status = "ol" -	b, err := json.Marshal(release) -	b = append(b, bytesNewline...) -	return b, err +	return skate.JsonMarshalNewline(release)  } diff --git a/skate/cmd/skate-from-unstructured/main.go b/skate/cmd/skate-from-unstructured/main.go index 179057d..2ccdd7d 100644 --- a/skate/cmd/skate-from-unstructured/main.go +++ b/skate/cmd/skate-from-unstructured/main.go @@ -28,7 +28,7 @@ func main() {  		if err := skate.ParseUnstructured(&ref); err != nil {  			return nil, err  		} -		return skate.JsonMarshalLine(&ref) +		return skate.JsonMarshalNewline(&ref)  	})  	pp.NumWorkers = *numWorkers  	pp.BatchSize = *batchSize  | 
