diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-24 21:42:22 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-24 21:42:22 +0200 |
commit | 7dba9e88abe328e59da73d475b17d627cd190901 (patch) | |
tree | 360f58d77976f6752af8f1ff7b6caea6f07cf909 /skate | |
parent | 8b18e4311aceae98f730dc655c24ba72494dc9ae (diff) | |
download | refcat-7dba9e88abe328e59da73d475b17d627cd190901.tar.gz refcat-7dba9e88abe328e59da73d475b17d627cd190901.zip |
add two new mappers; OL related
Diffstat (limited to 'skate')
-rw-r--r-- | skate/cmd/skate-map/main.go | 19 | ||||
-rw-r--r-- | skate/map.go | 32 |
2 files changed, 42 insertions, 9 deletions
diff --git a/skate/cmd/skate-map/main.go b/skate/cmd/skate-map/main.go index 6f3acb9..132d4ab 100644 --- a/skate/cmd/skate-map/main.go +++ b/skate/cmd/skate-map/main.go @@ -65,14 +65,17 @@ var ( func main() { flag.Parse() availableMappers := map[string]skate.Mapper{ - // Add new mapper functions here. TODO: add more docs. - "id": skate.Identity, - "ff": skate.CreateFixedMapper(*extraValue), - "ti": skate.MapperTitle, - "tn": skate.MapperTitleNormalized, - "ty": skate.MapperTitleNysiis, - "ts": skate.MapperTitleSandcrawler, - "ru": skate.MapperURLFromRef, + // Add new mapper functions here. TODO: add more docs, and improve + // composability, e.g. like middleware. + "id": skate.Identity, + "ff": skate.CreateFixedMapper(*extraValue), + "ti": skate.MapperTitle, + "tn": skate.MapperTitleNormalized, + "ty": skate.MapperTitleNysiis, + "ts": skate.MapperTitleSandcrawler, + "ru": skate.MapperURLFromRef, + "cti": skate.MapperContainerTitle, + "cts": skate.MapperContainerTitleSandcrawler, } if *logFile != "" { f, err := os.OpenFile(*logFile, os.O_CREATE|os.O_APPEND, 0644) diff --git a/skate/map.go b/skate/map.go index 27bc627..737c9af 100644 --- a/skate/map.go +++ b/skate/map.go @@ -19,11 +19,16 @@ var ( ErrMissingFieldName = errors.New("missing field name") ) -// Title is a document with a title. +// TitleDoc is a document with a title. type TitleDoc struct { Title string `json:"title"` } +// ContainerTitleDoc is a document with a container title. +type ContainerTitleDoc struct { + ContainerTitle string `json:"container_title"` +} + // PartialDoc for docs, that do not have DOI or title. E.g. we found 49701699 // (NCVY), 36401044 (NCVYU), 29668363 (NCUY), and so on. Some examples: XXX type PartialDoc struct { @@ -190,6 +195,31 @@ func MapperTitleSandcrawler(p []byte) (fields [][]byte, err error) { return fields, nil } +// MapperContainerTitle extracts (container_title, doc). 
+func MapperContainerTitle(p []byte) ([][]byte, error) { + var ( + doc ContainerTitleDoc + key []byte + ) + if err := json.Unmarshal(p, &doc); err != nil { + return nil, err + } else { + key = []byte(wsReplacer.Replace(strings.TrimSpace(doc.ContainerTitle))) + } + return [][]byte{key, p}, nil +} + +// MapperContainerTitleSandcrawler extracts (container_title, doc). +func MapperContainerTitleSandcrawler(p []byte) (fields [][]byte, err error) { + if fields, err = MapperContainerTitle(p); err != nil { + return nil, err + } + key := string(fields[0]) + key = sandcrawlerSlugify(wsReplacer.Replace(strings.TrimSpace(key))) + fields[0] = []byte(key) + return fields, nil +} + // MapperURLFromRef extracts the (work ident, release ident, url, doc). // Previously: parallel -j 16 --block 100M --pipe "jq -rc '[.work_ident, // .release_ident, .biblio.url?] | @tsv'" ... |