aboutsummaryrefslogtreecommitdiffstats
path: root/skate/map.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/map.go')
-rw-r--r-- skate/map.go 32
1 files changed, 31 insertions, 1 deletions
diff --git a/skate/map.go b/skate/map.go
index 27bc627..737c9af 100644
--- a/skate/map.go
+++ b/skate/map.go
@@ -19,11 +19,16 @@ var (
ErrMissingFieldName = errors.New("missing field name")
)
-// Title is a document with a title.
+// TitleDoc is a document with a title.
type TitleDoc struct {
Title string `json:"title"` // decoded from the "title" JSON key
}
+// ContainerTitleDoc is a document with a container title; only the
+// "container_title" JSON key is decoded into it.
+type ContainerTitleDoc struct {
+ ContainerTitle string `json:"container_title"`
+}
+
// PartialDoc for docs, that do not have DOI or title. E.g. we found 49701699
// (NCVY), 36401044 (NCVYU), 29668363 (NCUY), and so on. Some examples: XXX
type PartialDoc struct {
@@ -190,6 +195,31 @@ func MapperTitleSandcrawler(p []byte) (fields [][]byte, err error) {
return fields, nil
}
+// MapperContainerTitle extracts (container_title, doc). The key is the
+// container title of the JSON document p, with surrounding whitespace trimmed
+// and then passed through wsReplacer; the second field is p itself, unchanged.
+// If p cannot be decoded as JSON, the decoding error is returned and no fields
+// are emitted.
+func MapperContainerTitle(p []byte) ([][]byte, error) {
+	var doc ContainerTitleDoc
+	if err := json.Unmarshal(p, &doc); err != nil {
+		return nil, err
+	}
+	key := wsReplacer.Replace(strings.TrimSpace(doc.ContainerTitle))
+	return [][]byte{[]byte(key), p}, nil
+}
+
+// MapperContainerTitleSandcrawler extracts (container_title, doc) like
+// MapperContainerTitle, but the key is additionally trimmed, passed through
+// wsReplacer and run through sandcrawlerSlugify. Errors from
+// MapperContainerTitle are returned as is, with nil fields.
+func MapperContainerTitleSandcrawler(p []byte) (fields [][]byte, err error) {
+	fields, err = MapperContainerTitle(p)
+	if err != nil {
+		return nil, err
+	}
+	cleaned := wsReplacer.Replace(strings.TrimSpace(string(fields[0])))
+	fields[0] = []byte(sandcrawlerSlugify(cleaned))
+	return fields, nil
+}
+
// MapperURLFromRef extracts the (work ident, release ident, url, doc).
// Previously: parallel -j 16 --block 100M --pipe "jq -rc '[.work_ident,
// .release_ident, .biblio.url?] | @tsv'" ...