diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-26 22:47:05 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-26 22:47:05 +0200 |
commit | fff30b71abf23222f1dbc7e45591cdd093bf85d1 (patch) | |
tree | a38e8856afc0e46fab0a3067ab3141969b13a516 | |
parent | 57826605209de687e0b6e6cb151021b7bcf034ca (diff) | |
download | refcat-fff30b71abf23222f1dbc7e45591cdd093bf85d1.tar.gz refcat-fff30b71abf23222f1dbc7e45591cdd093bf85d1.zip |
add author key mapping
-rw-r--r-- | skate/cmd/skate-conv/main.go | 26 | ||||
-rw-r--r-- | skate/schema.go | 19 | ||||
-rw-r--r-- | skate/xio/util.go | 36 |
3 files changed, 72 insertions, 9 deletions
```diff
diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go
index d48c913..3627c67 100644
--- a/skate/cmd/skate-conv/main.go
+++ b/skate/cmd/skate-conv/main.go
@@ -19,11 +19,17 @@ import (
 )
 
 var (
-	numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
-	batchSize  = flag.Int("b", 100000, "batch size")
-	fromFormat = flag.String("f", "ref", "import schema: ref, rg, ol, oled")
+	numWorkers                    = flag.Int("w", runtime.NumCPU(), "number of workers")
+	batchSize                     = flag.Int("b", 100000, "batch size")
+	fromFormat                    = flag.String("f", "ref", "import schema: ref, rg, ol, oled")
+	extraOpenLibraryAuthorMapping = flag.String("Xa", "", "TSV file, mapping OL author keys (e.g. to plain text names")
 
 	f func([]byte) ([]byte, error) // our converter function
+
+	// map OL author key to author name, e.g. via: zstdcat -T0
+	// ol_dump_authors_latest.txt.zst | cut -f 5 | jq -rc '[.key, .name] |
+	// @tsv'
+	openLibraryAuthorMap = make(map[string]string)
 )
 
 func main() {
@@ -37,6 +43,18 @@ func main() {
 		f = openLibraryToRelease
 	case "oled":
 		f = openLibraryEditionToRelease
+		if *extraOpenLibraryAuthorMapping != "" {
+			f, err := os.Open(*extraOpenLibraryAuthorMapping)
+			if err != nil {
+				log.Fatal(err)
+			}
+			defer f.Close()
+			m, err := skate.TabsToMap(f, "\t", 1, 2)
+			if err != nil {
+				log.Fatal(err)
+			}
+			openLibraryAuthorMap = m
+		}
 	default:
 		log.Fatalf("unsupported input schema: %v", *fromFormat)
 	}
@@ -118,7 +136,7 @@ func openLibraryEditionToRelease(p []byte) ([]byte, error) {
 	if err = json.Unmarshal(p, &w); err != nil {
 		return nil, err
 	}
-	if release, err = skate.OpenLibraryEditionToRelease(&w); err != nil {
+	if release, err = skate.OpenLibraryEditionToRelease(&w, openLibraryAuthorMap); err != nil {
 		return nil, err
 	}
 	release.Extra.Skate.Status = "oled"
diff --git a/skate/schema.go b/skate/schema.go
index fda2e7a..92a66f0 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -530,8 +530,23 @@ func (v OpenLibraryEdition) Isbns() []string {
 }
 
 // OpenLibraryEditionToRelease convert OL data into a release. XXX: release/work?
-func OpenLibraryEditionToRelease(v *OpenLibraryEdition) (*Release, error) {
-	var release = Release{}
+func OpenLibraryEditionToRelease(v *OpenLibraryEdition, authorMap map[string]string) (*Release, error) {
+	var (
+		release  Release
+		contribs = make([]struct {
+			Index   int    `json:"index,omitempty"`
+			RawName string `json:"raw_name,omitempty"`
+			Role    string `json:"role,omitempty"`
+		}, len(v.Authors))
+	)
+	for i, author := range v.Authors {
+		name, ok := authorMap[author.Key]
+		if !ok {
+			continue
+		}
+		contribs[i].RawName = name
+	}
+	release.Contribs = contribs
 	release.Title = v.Title
 	release.ExtIDs.ISBN = v.Isbns()
 	if len(v.Publishers) > 0 {
diff --git a/skate/xio/util.go b/skate/xio/util.go
index 554317b..c0439e5 100644
--- a/skate/xio/util.go
+++ b/skate/xio/util.go
@@ -1,9 +1,14 @@
 package xio
 
-import "os"
+import (
+	"bufio"
+	"io"
+	"os"
+	"strings"
+)
 
-// OpenTwo opens two files, and the caller needs to check for a single error only.
-func OpenTwo(f1, f2 string) (g1 *os.File, g2 *os.File, err error) {
+// OpenTwo opens two files. The caller needs to check for a single error only.
+func OpenTwo(f1, f2 string) (g1, g2 *os.File, err error) {
 	if g1, err = os.Open(f1); err != nil {
 		return nil, nil, err
 	}
@@ -12,3 +17,28 @@ func OpenTwo(f1, f2 string) (g1 *os.File, g2 *os.File, err error) {
 	}
 	return g1, g2, nil
 }
+
+// TabsToMap read from a reader and turns values from kCol, vCol columns into a
+// mapping.
+func TabsToMap(r io.Reader, sep string, kCol, vCol int) (map[string]string, error) {
+	var (
+		br = bufio.NewReader(r)
+		m  = make(map[string]string)
+	)
+	for {
+		line, err := br.ReadString('\n')
+		if err == io.EOF {
+			return m, nil
+		}
+		if err != nil {
+			return nil, err
+		}
+		fields := strings.Split(line, sep)
+		if len(fields) <= kCol && len(fields) <= vCol {
+			k := strings.TrimSpace(fields[kCol-1])
+			v := strings.TrimSpace(fields[vCol-1])
+			m[k] = v
+		}
+	}
+	return m, nil
+}
```