aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cmd/skate-conv/main.go
blob: 334da8114bba0c60fbbb64539b3fbc2b3ca8489d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
// skate-conv converts various schemas into releases. This should replace the
// very specific skate-ref-to-release and the like.
//
// $ skate-conv -f ref < INPUT > OUTPUT
//
// Currently supported source schemas: "ref", "rg", "ol", "oled"
package main

import (
	"flag"
	"log"
	"os"
	"runtime"
	"strings"

	"git.archive.org/martin/cgraph/skate"
	"git.archive.org/martin/cgraph/skate/parallel"
	"git.archive.org/martin/cgraph/skate/xio"
	json "github.com/segmentio/encoding/json"
)

// Command line flags and package-level state shared by the converters.
var (
	numWorkers                    = flag.Int("w", runtime.NumCPU(), "number of workers")
	batchSize                     = flag.Int("b", 100000, "batch size")
	bestEffort                    = flag.Bool("B", false, "only log errors, do not halt")
	fromFormat                    = flag.String("f", "ref", "import schema: ref, rg, ol, oled")
	extraOpenLibraryAuthorMapping = flag.String("Xa", "", "TSV file, mapping OL author keys to plain text names (used with -f oled)")

	f func([]byte) ([]byte, error) // our converter function, selected in main via -f

	// map OL author key to author name, e.g. via: zstdcat -T0
	// ol_dump_authors_latest.txt.zst | cut -f 5 | jq -rc '[.key, .name] |
	// @tsv'
	//
	// Starts out empty (but non-nil). It is deliberately not pre-sized: main
	// replaces it wholesale with the map loaded from the -Xa file, so a size
	// hint here would only allocate buckets that are never used.
	openLibraryAuthorMap = map[string]string{}
)

func main() {
	flag.Parse()
	switch *fromFormat {
	case "ref":
		f = refToRelease
	case "rg":
		f = rgSitemapToRelease
	case "ol":
		f = openLibraryToRelease
	case "oled":
		f = openLibraryEditionToRelease
		if *extraOpenLibraryAuthorMapping != "" {
			log.Printf("loading author mapping from %v ...", *extraOpenLibraryAuthorMapping)
			m, err := xio.TabsToMapFile(*extraOpenLibraryAuthorMapping, "\t", 1, 2)
			if err != nil {
				log.Fatal(err)
			}
			openLibraryAuthorMap = m
			log.Printf("found: %v", len(openLibraryAuthorMap))
		}
	default:
		log.Fatalf("unsupported input schema: %v", *fromFormat)
	}
	pp := parallel.NewProcessor(os.Stdin, os.Stdout, f)
	pp.NumWorkers = *numWorkers
	pp.BatchSize = *batchSize
	if err := pp.Run(); err != nil {
		log.Fatal(err)
	}
}

// refToRelease turns a single JSON encoded ref document into a newline
// terminated, JSON encoded release. On top of the standard conversion done
// by skate.RefToRelease, the release is marked with status "ref" and carries
// over the index and key of the originating ref.
//
// If the input cannot be decoded, the error is returned, unless best effort
// mode (-B) is on; in that case the raw input is logged and conversion
// continues with whatever has been decoded.
func refToRelease(p []byte) ([]byte, error) {
	var doc skate.Ref
	if err := json.Unmarshal(p, &doc); err != nil {
		if !*bestEffort {
			return nil, err
		}
		log.Printf("failed to unmarshal: %v", string(p))
	}
	rel, err := skate.RefToRelease(&doc)
	if err != nil {
		return nil, err
	}
	rel.Extra.Skate.Ref.Key = doc.Key
	rel.Extra.Skate.Ref.Index = doc.Index
	rel.Extra.Skate.Status = "ref" // means: converted from ref
	return skate.JsonMarshalNewline(rel)
}

// rgSitemapToRelease converts a simple sitemap entry to a release entity, e.g.
// from
// https://archive.org/download/rg_sitemap_2021_02_23/rg_sitemap_2021_02_23.ndj.zst.
//
// The release gets the title of the sitemap entry, status "rg" and the
// original URL. For publication URLs, the leading id of the last path segment
// is used as a pseudo ident, e.g. "321885388" for
// https://www.researchgate.net/publication/321885388_We_came_here_on_dif. Any
// other URL leaves the ident empty.
//
// If the input cannot be decoded, the error is returned, unless best effort
// mode (-B) is on; in that case the raw input is logged and conversion
// continues with whatever has been decoded.
func rgSitemapToRelease(p []byte) ([]byte, error) {
	// Only URLs below this prefix carry the id we are after. The prefix is
	// checked explicitly rather than cutting at a fixed byte offset, so that
	// other URLs (profiles, different scheme or host) do not end up with an
	// arbitrary URL fragment as ident.
	const publicationPrefix = "https://www.researchgate.net/publication/"
	var (
		s       skate.SitemapEntry
		release skate.Release
	)
	if err := json.Unmarshal(p, &s); err != nil {
		if !*bestEffort {
			return nil, err
		}
		log.Printf("failed to unmarshal: %v", string(p))
	}
	release.Title = s.Title
	if strings.HasPrefix(s.URL, publicationPrefix) {
		// A pseudo ident, maybe irritating; everything up to the first
		// underscore, or the whole remainder, if there is none.
		ident := s.URL[len(publicationPrefix):]
		if i := strings.IndexByte(ident, '_'); i >= 0 {
			ident = ident[:i]
		}
		release.Ident = ident
	}
	release.Extra.Skate.Status = "rg"
	release.Extra.Skate.ResearchGate.URL = s.URL
	return skate.JsonMarshalNewline(release)
}

// openLibraryToRelease turns a single JSON encoded Open Library work item
// (decoded as skate.OpenLibrarySolrDoc) into a newline terminated, JSON
// encoded release with status "ol".
//
// If the input cannot be decoded, the error is returned, unless best effort
// mode (-B) is on; in that case the raw input is logged and conversion
// continues with whatever has been decoded.
func openLibraryToRelease(p []byte) ([]byte, error) {
	var doc skate.OpenLibrarySolrDoc
	if err := json.Unmarshal(p, &doc); err != nil {
		if !*bestEffort {
			return nil, err
		}
		log.Printf("failed to unmarshal: %v", string(p))
	}
	rel, err := skate.OpenLibrarySolrDocToRelease(&doc)
	if err != nil {
		return nil, err
	}
	rel.Extra.Skate.Status = "ol"
	return skate.JsonMarshalNewline(rel)
}

// openLibraryEditionToRelease turns a single JSON encoded Open Library
// edition item into a newline terminated, JSON encoded release with status
// "oled". The package-level openLibraryAuthorMap is handed to the conversion
// (presumably to resolve author keys to names; see
// skate.OpenLibraryEditionToRelease).
//
// If the input cannot be decoded, the error is returned, unless best effort
// mode (-B) is on; in that case the raw input is logged and conversion
// continues with whatever has been decoded.
func openLibraryEditionToRelease(p []byte) ([]byte, error) {
	var edition skate.OpenLibraryEdition
	if err := json.Unmarshal(p, &edition); err != nil {
		if !*bestEffort {
			return nil, err
		}
		log.Printf("failed to unmarshal: %v", string(p))
	}
	rel, err := skate.OpenLibraryEditionToRelease(&edition, openLibraryAuthorMap)
	if err != nil {
		return nil, err
	}
	rel.Extra.Skate.Status = "oled"
	return skate.JsonMarshalNewline(rel)
}