// skate-conv converts various schemas into releases. This should replace the // very specific skate-ref-to-release and the like. // // $ skate-conv -f ref < FILE > FILE // // Currently source schemas: "ref", "ol", "rg" package main import ( "flag" "log" "os" "runtime" "strings" "git.archive.org/martin/cgraph/skate" "git.archive.org/martin/cgraph/skate/parallel" "git.archive.org/martin/cgraph/skate/xio" "github.com/segmentio/encoding/json" ) var ( numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") batchSize = flag.Int("b", 100000, "batch size") bestEffort = flag.Bool("B", false, "only log errors, do not halt") fromFormat = flag.String("f", "ref", "import schema: ref, rg, ol, oled") extraOpenLibraryAuthorMapping = flag.String("Xa", "", "TSV file, mapping OL author keys (e.g. to plain text names") f func([]byte) ([]byte, error) // our converter function // map OL author key to author name, e.g. via: zstdcat -T0 // ol_dump_authors_latest.txt.zst | cut -f 5 | jq -rc '[.key, .name] | // @tsv' openLibraryAuthorMap = make(map[string]string, 8388608) ) func main() { flag.Parse() switch *fromFormat { case "ref": f = refToRelease case "rg": f = rgSitemapToRelease case "ol": f = openLibraryToRelease case "oled": f = openLibraryEditionToRelease if *extraOpenLibraryAuthorMapping != "" { log.Printf("loading author mapping from %v ...", *extraOpenLibraryAuthorMapping) m, err := xio.TabsToMapFile(*extraOpenLibraryAuthorMapping, "\t", 1, 2) if err != nil { log.Fatal(err) } openLibraryAuthorMap = m log.Printf("found: %v", len(openLibraryAuthorMap)) } default: log.Fatalf("unsupported input schema: %v", *fromFormat) } pp := parallel.NewProcessor(os.Stdin, os.Stdout, f) pp.NumWorkers = *numWorkers pp.BatchSize = *batchSize if err := pp.Run(); err != nil { log.Fatal(err) } } // refToRelease converts a ref document to a release. The standard conversion // plus some extra fields. func refToRelease(p []byte) ([]byte, error) { var ( ref skate.Ref release *skate.Release err error ) if err = json.Unmarshal(p, &ref); err != nil { if *bestEffort { log.Printf("failed to unmarshal: %v", string(p)) } else { return nil, err } } if release, err = skate.RefToRelease(&ref); err != nil { return nil, err } release.Extra.Skate.Status = "ref" // means: converted from ref release.Extra.Skate.Ref.Index = ref.Index release.Extra.Skate.Ref.Key = ref.Key return skate.JsonMarshalNewline(release) } // rgSitemapToRelease converts a simple sitemap to a release entity, e.g. from // https://archive.org/download/rg_sitemap_2021_02_23/rg_sitemap_2021_02_23.ndj.zst. func rgSitemapToRelease(p []byte) ([]byte, error) { var ( s skate.SitemapEntry release skate.Release err error ) if err = json.Unmarshal(p, &s); err != nil { if *bestEffort { log.Printf("failed to unmarshal: %v", string(p)) } else { return nil, err } } release.Title = s.Title if len(s.URL) > 41 { // A pseudo ident, maybe irritating; we want the "321885388". // https://www.researchgate.net/publication/321885388_We_came_here_on_dif release.Ident = strings.Split(s.URL[41:], "_")[0] } release.Extra.Skate.Status = "rg" release.Extra.Skate.ResearchGate.URL = s.URL return skate.JsonMarshalNewline(release) } // openLibraryToRelease converts an Open Library work item to a release. func openLibraryToRelease(p []byte) ([]byte, error) { var ( w skate.OpenLibrarySolrDoc release *skate.Release err error ) if err = json.Unmarshal(p, &w); err != nil { if *bestEffort { log.Printf("failed to unmarshal: %v", string(p)) } else { return nil, err } } if release, err = skate.OpenLibrarySolrDocToRelease(&w); err != nil { return nil, err } release.Extra.Skate.Status = "ol" return skate.JsonMarshalNewline(release) } // openLibraryEditionToRelease converts an Open Library edition item to a // release. func openLibraryEditionToRelease(p []byte) ([]byte, error) { var ( w skate.OpenLibraryEdition release *skate.Release err error ) if err = json.Unmarshal(p, &w); err != nil { if *bestEffort { log.Printf("failed to unmarshal: %v", string(p)) } else { return nil, err } } if release, err = skate.OpenLibraryEditionToRelease(&w, openLibraryAuthorMap); err != nil { return nil, err } release.Extra.Skate.Status = "oled" return skate.JsonMarshalNewline(release) }