1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
|
// skate-conv converts various schemas into releases. This should replace the
// very specific skate-ref-to-release and the like.
//
// $ skate-conv -f ref < FILE > FILE
//
// Currently supported source schemas: "ref", "rg", "ol", "oled"
package main
import (
"flag"
"log"
"os"
"runtime"
"strings"
"github.com/segmentio/encoding/json"
"gitlab.com/internetarchive/refcat/skate"
"gitlab.com/internetarchive/refcat/skate/parallel"
"gitlab.com/internetarchive/refcat/skate/xio"
)
var (
	// Command line flags controlling parallelism, error handling and the
	// input schema.
	numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
	batchSize  = flag.Int("b", 100000, "batch size")
	bestEffort = flag.Bool("B", false, "only log errors, do not halt")
	fromFormat = flag.String("f", "ref", "import schema: ref, rg, ol, oled")

	// Optional path to a tab separated file with Open Library author data.
	extraOpenLibraryAuthorMapping = flag.String("Xa", "", "TSV file, mapping OL author keys (e.g. to plain text names")

	// f is our converter function; main selects it based on the -f flag. It
	// takes one input line and returns the serialized release.
	f func([]byte) ([]byte, error)

	// openLibraryAuthorMap maps an OL author key to an author name. A
	// suitable TSV file can be created e.g. via:
	//
	//   zstdcat -T0 ol_dump_authors_latest.txt.zst | cut -f 5 | jq -rc '[.key, .name] | @tsv'
	//
	// It starts out empty (but non-nil) and without a size hint: main
	// replaces it wholesale with the map loaded from the -Xa file, so
	// reserving room for millions of entries up front would only cost memory
	// on every run, regardless of the selected schema.
	openLibraryAuthorMap = map[string]string{}
)
func main() {
flag.Parse()
switch *fromFormat {
case "ref":
f = refToRelease
case "rg":
f = rgSitemapToRelease
case "ol":
f = openLibraryToRelease
case "oled":
f = openLibraryEditionToRelease
if *extraOpenLibraryAuthorMapping != "" {
log.Printf("loading author mapping from %v ...", *extraOpenLibraryAuthorMapping)
f, err := os.Open(*extraOpenLibraryAuthorMapping)
if err != nil {
log.Fatal(err)
}
defer f.Close()
m, err := xio.MapFromTabular(f, "\t", 1, 2)
if err != nil {
log.Fatal(err)
}
openLibraryAuthorMap = m
log.Printf("found: %v", len(openLibraryAuthorMap))
}
default:
log.Fatalf("unsupported input schema: %v", *fromFormat)
}
pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
pp.NumWorkers = *numWorkers
pp.BatchSize = *batchSize
if err := pp.Run(); err != nil {
log.Fatal(err)
}
}
// refToRelease turns a single JSON encoded ref document into a serialized
// release. On top of the standard skate.RefToRelease conversion, the release
// is marked with status "ref" and carries over the index and key of the ref.
//
// If the input cannot be decoded, the error is returned, unless the best
// effort flag is set; in that case the failure is only logged and the
// conversion continues with whatever has been decoded into the ref value.
// NOTE(review): confirm that emitting a release after a failed decode is
// intended, as opposed to skipping the record.
func refToRelease(p []byte) ([]byte, error) {
	var ref skate.Ref
	if err := json.Unmarshal(p, &ref); err != nil {
		if !*bestEffort {
			return nil, err
		}
		log.Printf("failed to unmarshal: %v", string(p))
	}
	release, err := skate.RefToRelease(&ref)
	if err != nil {
		return nil, err
	}
	// Status "ref" means: converted from ref.
	release.Extra.Skate.Status = "ref"
	release.Extra.Skate.Ref.Key = ref.Key
	release.Extra.Skate.Ref.Index = ref.Index
	return skate.JsonMarshalNewline(release)
}
// rgSitemapToRelease turns a single JSON encoded sitemap entry into a
// serialized release, e.g. from
// https://archive.org/download/rg_sitemap_2021_02_23/rg_sitemap_2021_02_23.ndj.zst.
//
// The release gets the title and URL of the entry and the status "rg". If the
// input cannot be decoded, the error is returned, unless the best effort flag
// is set; in that case the failure is only logged and the conversion
// continues with whatever has been decoded into the entry.
func rgSitemapToRelease(p []byte) ([]byte, error) {
	// identOffset is the length of the URL prefix
	// "https://www.researchgate.net/publication/"; the pseudo ident starts
	// right after it.
	const identOffset = 41

	var entry skate.SitemapEntry
	if err := json.Unmarshal(p, &entry); err != nil {
		if !*bestEffort {
			return nil, err
		}
		log.Printf("failed to unmarshal: %v", string(p))
	}

	var release skate.Release
	release.Title = entry.Title
	release.Extra.Skate.Status = "rg"
	release.Extra.Skate.ResearchGate.URL = entry.URL
	if len(entry.URL) > identOffset {
		// A pseudo ident, maybe irritating; we want the "321885388" from
		// https://www.researchgate.net/publication/321885388_We_came_here_on_dif,
		// that is everything after the prefix up to the first underscore.
		ident := entry.URL[identOffset:]
		if i := strings.IndexByte(ident, '_'); i >= 0 {
			ident = ident[:i]
		}
		release.Ident = ident
	}
	return skate.JsonMarshalNewline(release)
}
// openLibraryToRelease turns a single JSON encoded Open Library work item
// (decoded as skate.OpenLibrarySolrDoc) into a serialized release, marked
// with status "ol".
//
// If the input cannot be decoded, the error is returned, unless the best
// effort flag is set; in that case the failure is only logged and the
// conversion continues with whatever has been decoded into the document.
func openLibraryToRelease(p []byte) ([]byte, error) {
	var doc skate.OpenLibrarySolrDoc
	if err := json.Unmarshal(p, &doc); err != nil {
		if !*bestEffort {
			return nil, err
		}
		log.Printf("failed to unmarshal: %v", string(p))
	}
	release, err := skate.OpenLibrarySolrDocToRelease(&doc)
	if err != nil {
		return nil, err
	}
	release.Extra.Skate.Status = "ol"
	return skate.JsonMarshalNewline(release)
}
// openLibraryEditionToRelease turns a single JSON encoded Open Library
// edition item into a serialized release, marked with status "oled". The
// package level openLibraryAuthorMap is handed to the conversion.
//
// If the input cannot be decoded, the error is returned, unless the best
// effort flag is set; in that case the failure is only logged and the
// conversion continues with whatever has been decoded into the edition.
func openLibraryEditionToRelease(p []byte) ([]byte, error) {
	var edition skate.OpenLibraryEdition
	if err := json.Unmarshal(p, &edition); err != nil {
		if !*bestEffort {
			return nil, err
		}
		log.Printf("failed to unmarshal: %v", string(p))
	}
	release, err := skate.OpenLibraryEditionToRelease(&edition, openLibraryAuthorMap)
	if err != nil {
		return nil, err
	}
	release.Extra.Skate.Status = "oled"
	return skate.JsonMarshalNewline(release)
}
|