aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-05-26 22:47:05 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-05-26 22:47:05 +0200
commitfff30b71abf23222f1dbc7e45591cdd093bf85d1 (patch)
treea38e8856afc0e46fab0a3067ab3141969b13a516
parent57826605209de687e0b6e6cb151021b7bcf034ca (diff)
downloadrefcat-fff30b71abf23222f1dbc7e45591cdd093bf85d1.tar.gz
refcat-fff30b71abf23222f1dbc7e45591cdd093bf85d1.zip
add author key mapping
-rw-r--r--skate/cmd/skate-conv/main.go26
-rw-r--r--skate/schema.go19
-rw-r--r--skate/xio/util.go36
3 files changed, 72 insertions, 9 deletions
diff --git a/skate/cmd/skate-conv/main.go b/skate/cmd/skate-conv/main.go
index d48c913..3627c67 100644
--- a/skate/cmd/skate-conv/main.go
+++ b/skate/cmd/skate-conv/main.go
@@ -19,11 +19,17 @@ import (
)
var (
- numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
- batchSize = flag.Int("b", 100000, "batch size")
- fromFormat = flag.String("f", "ref", "import schema: ref, rg, ol, oled")
+ numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
+ batchSize = flag.Int("b", 100000, "batch size")
+ fromFormat = flag.String("f", "ref", "import schema: ref, rg, ol, oled")
+ extraOpenLibraryAuthorMapping = flag.String("Xa", "", "TSV file, mapping OL author keys (e.g. to plain text names")
f func([]byte) ([]byte, error) // our converter function
+
+ // map OL author key to author name, e.g. via: zstdcat -T0
+ // ol_dump_authors_latest.txt.zst | cut -f 5 | jq -rc '[.key, .name] |
+ // @tsv'
+ openLibraryAuthorMap = make(map[string]string)
)
func main() {
@@ -37,6 +43,18 @@ func main() {
f = openLibraryToRelease
case "oled":
f = openLibraryEditionToRelease
+ if *extraOpenLibraryAuthorMapping != "" {
+ f, err := os.Open(*extraOpenLibraryAuthorMapping)
+ if err != nil {
+ log.Fatal(err)
+ }
+ defer f.Close()
+ m, err := skate.TabsToMap(f, "\t", 1, 2)
+ if err != nil {
+ log.Fatal(err)
+ }
+ openLibraryAuthorMap = m
+ }
default:
log.Fatalf("unsupported input schema: %v", *fromFormat)
}
@@ -118,7 +136,7 @@ func openLibraryEditionToRelease(p []byte) ([]byte, error) {
if err = json.Unmarshal(p, &w); err != nil {
return nil, err
}
- if release, err = skate.OpenLibraryEditionToRelease(&w); err != nil {
+ if release, err = skate.OpenLibraryEditionToRelease(&w, openLibraryAuthorMap); err != nil {
return nil, err
}
release.Extra.Skate.Status = "oled"
diff --git a/skate/schema.go b/skate/schema.go
index fda2e7a..92a66f0 100644
--- a/skate/schema.go
+++ b/skate/schema.go
@@ -530,8 +530,23 @@ func (v OpenLibraryEdition) Isbns() []string {
}
// OpenLibraryEditionToRelease convert OL data into a release. XXX: release/work?
-func OpenLibraryEditionToRelease(v *OpenLibraryEdition) (*Release, error) {
- var release = Release{}
+func OpenLibraryEditionToRelease(v *OpenLibraryEdition, authorMap map[string]string) (*Release, error) {
+ var (
+ release Release
+ contribs = make([]struct {
+ Index int `json:"index,omitempty"`
+ RawName string `json:"raw_name,omitempty"`
+ Role string `json:"role,omitempty"`
+ }, len(v.Authors))
+ )
+ for i, author := range v.Authors {
+ name, ok := authorMap[author.Key]
+ if !ok {
+ continue
+ }
+ contribs[i].RawName = name
+ }
+ release.Contribs = contribs
release.Title = v.Title
release.ExtIDs.ISBN = v.Isbns()
if len(v.Publishers) > 0 {
diff --git a/skate/xio/util.go b/skate/xio/util.go
index 554317b..c0439e5 100644
--- a/skate/xio/util.go
+++ b/skate/xio/util.go
@@ -1,9 +1,14 @@
package xio
-import "os"
+import (
+ "bufio"
+ "io"
+ "os"
+ "strings"
+)
-// OpenTwo opens two files, and the caller needs to check for a single error only.
-func OpenTwo(f1, f2 string) (g1 *os.File, g2 *os.File, err error) {
+// OpenTwo opens two files. The caller needs to check for a single error only.
+func OpenTwo(f1, f2 string) (g1, g2 *os.File, err error) {
if g1, err = os.Open(f1); err != nil {
return nil, nil, err
}
@@ -12,3 +17,28 @@ func OpenTwo(f1, f2 string) (g1 *os.File, g2 *os.File, err error) {
}
return g1, g2, nil
}
+
// TabsToMap reads lines from r, splits each line on sep and builds a mapping
// from the value found in column kCol to the value found in column vCol.
//
// Column numbers are 1-based. Keys and values have surrounding whitespace
// removed. Lines that do not have enough columns are skipped, and if a key
// occurs more than once, the last occurrence wins. A final line without a
// trailing newline is processed like any other line.
//
// It returns os.ErrInvalid if kCol or vCol is less than one, and any read
// error other than io.EOF reported by the underlying reader.
func TabsToMap(r io.Reader, sep string, kCol, vCol int) (map[string]string, error) {
	if kCol < 1 || vCol < 1 {
		return nil, os.ErrInvalid
	}
	var (
		br = bufio.NewReader(r)
		m  = make(map[string]string)
	)
	for {
		line, err := br.ReadString('\n')
		if err != nil && err != io.EOF {
			return nil, err
		}
		// ReadString hands back whatever was read before hitting EOF, so the
		// line has to be handled before the EOF check, otherwise the last
		// line of a file lacking a trailing newline would be lost.
		if len(line) > 0 {
			fields := strings.Split(line, sep)
			// Both columns must exist in this row; short rows are ignored.
			if len(fields) >= kCol && len(fields) >= vCol {
				k := strings.TrimSpace(fields[kCol-1])
				v := strings.TrimSpace(fields[vCol-1])
				m[k] = v
			}
		}
		if err == io.EOF {
			return m, nil
		}
	}
}