aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--skate/cmd/skate-map/main.go19
-rw-r--r--skate/map.go32
2 files changed, 42 insertions, 9 deletions
diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go
index 6f3acb9..132d4ab 100644
--- a/skate/cmd/skate-map/main.go
+++ b/skate/cmd/skate-map/main.go
@@ -65,14 +65,17 @@ var (
func main() {
flag.Parse()
availableMappers := map[string]skate.Mapper{
- // Add new mapper functions here. TODO: add more docs.
- "id": skate.Identity,
- "ff": skate.CreateFixedMapper(*extraValue),
- "ti": skate.MapperTitle,
- "tn": skate.MapperTitleNormalized,
- "ty": skate.MapperTitleNysiis,
- "ts": skate.MapperTitleSandcrawler,
- "ru": skate.MapperURLFromRef,
+ // Add new mapper functions here. TODO: add more docs, and improve
+ // composability, e.g. like middleware.
+ "id": skate.Identity,
+ "ff": skate.CreateFixedMapper(*extraValue),
+ "ti": skate.MapperTitle,
+ "tn": skate.MapperTitleNormalized,
+ "ty": skate.MapperTitleNysiis,
+ "ts": skate.MapperTitleSandcrawler,
+ "ru": skate.MapperURLFromRef,
+ "cti": skate.MapperContainerTitle,
+ "cts": skate.MapperContainerTitleSandcrawler,
}
if *logFile != "" {
f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644)
diff --git a/skate/map.go b/skate/map.go
index 27bc627..737c9af 100644
--- a/skate/map.go
+++ b/skate/map.go
@@ -19,11 +19,16 @@ var (
ErrMissingFieldName = errors.New("missing field name")
)
-// Title is a document with a title.
+// TitleDoc is a document with a title.
type TitleDoc struct {
Title string `json:"title"`
}
+// ContainerTitleDoc is a document with a container title.
+type ContainerTitleDoc struct {
+ ContainerTitle string `json:"container_title"`
+}
+
// PartialDoc for docs, that do not have DOI or title. E.g. we found 49701699
// (NCVY), 36401044 (NCVYU), 29668363 (NCUY), and so on. Some examples: XXX
type PartialDoc struct {
@@ -190,6 +195,31 @@ func MapperTitleSandcrawler(p []byte) (fields [][]byte, err error) {
return fields, nil
}
+// MapperContainerTitle extracts (container_title, doc).
+func MapperContainerTitle(p []byte) ([][]byte, error) {
+ var (
+ doc ContainerTitleDoc
+ key []byte
+ )
+ if err := json.Unmarshal(p, &doc); err != nil {
+ return nil, err
+ } else {
+ key = []byte(wsReplacer.Replace(strings.TrimSpace(doc.ContainerTitle)))
+ }
+ return [][]byte{key, p}, nil
+}
+
+// MapperContainerTitleSandcrawler extracts (container_title passed through sandcrawlerSlugify, doc).
+func MapperContainerTitleSandcrawler(p []byte) (fields [][]byte, err error) {
+ if fields, err = MapperContainerTitle(p); err != nil {
+ return nil, err
+ }
+ key := string(fields[0])
+ key = sandcrawlerSlugify(wsReplacer.Replace(strings.TrimSpace(key)))
+ fields[0] = []byte(key)
+ return fields, nil
+}
+
// MapperURLFromRef extracts the (work ident, release ident, url, doc).
// Previously: parallel -j 16 --block 100M --pipe "jq -rc '[.work_ident,
// .release_ident, .biblio.url?] | @tsv'" ...