diff options
-rw-r--r-- | skate/map.go | 36 |
1 files changed, 30 insertions, 6 deletions
diff --git a/skate/map.go b/skate/map.go index 571a297..9cad1e4 100644 --- a/skate/map.go +++ b/skate/map.go @@ -19,14 +19,24 @@ type TitleDoc struct { Title string `json:"title"` } -// Mapper converts a blob. -type Mapper func([]byte) ([]byte, error) +// PartialDoc for docs that do not have DOI or title. E.g. we found 49701699 +// (NCVY), 36401044 (NCVYU), 29668363 (NCUY), and so on. +type PartialDoc struct { + ContainerName string `json:"container_name"` + Contribs []struct { + RawName string `json:"raw_name"` + } `json:"contribs"` + Volume string `json:"volume"` + Unstructured string `json:"unstructured"` + Year string `json:"release_year"` +} // FieldMapper maps a blob to an arbitrary number of fields, e.g. for (key, -// doc) etc. +// doc). We want fields, but we do not want to bake in TSV into each function. type FieldMapper func([]byte) ([][]byte, error) -// TSV serialized the result of a field mapper as TSV. +// TSV serializes the result of a field mapper as TSV. This is a slim adapter, +// e.g. to parallel.Processor, which expects this function signature. func (f FieldMapper) TSV(p []byte) ([]byte, error) { fields, err := f(p) if err != nil { @@ -46,11 +56,14 @@ func NameOf(f interface{}) string { return v.String() } +// Identity returns just the input again. func Identity(p []byte) ([][]byte, error) { return [][]byte{p}, nil } -func CreateFixedMapper(path string) FieldMapper { +// CreateFixedMapper extracts the value from a given fixed top level json key. +// Returns a function that maps doc to (v, doc). 
+func CreateFixedMapper(field string) FieldMapper { f := func(p []byte) ([][]byte, error) { var ( doc map[string]interface{} @@ -61,7 +74,7 @@ func CreateFixedMapper(path string) FieldMapper { if err := json.Unmarshal(p, &doc); err != nil { return nil, err } - if v, ok = doc[path]; !ok { + if v, ok = doc[field]; !ok { return nil, nil } switch w := v.(type) { @@ -81,6 +94,7 @@ func CreateFixedMapper(path string) FieldMapper { return f } +// MapperTitle extracts (title, doc). func MapperTitle(p []byte) ([][]byte, error) { var ( doc TitleDoc @@ -94,6 +108,7 @@ func MapperTitle(p []byte) ([][]byte, error) { return [][]byte{key, p}, nil } +// MapperTitleNormalized extracts (title normalized, doc). func MapperTitleNormalized(p []byte) (fields [][]byte, err error) { if fields, err = MapperTitle(p); err != nil { return nil, err @@ -107,6 +122,7 @@ func MapperTitleNormalized(p []byte) (fields [][]byte, err error) { return fields, nil } +// MapperTitleNysiis extracts (title nysiis, doc). func MapperTitleNysiis(p []byte) (fields [][]byte, err error) { if fields, err = MapperTitle(p); err != nil { return nil, err @@ -118,6 +134,7 @@ func MapperTitleNysiis(p []byte) (fields [][]byte, err error) { return fields, nil } +// MapperTitleSandcrawler extracts (title sandcrawler, doc). func MapperTitleSandcrawler(p []byte) (fields [][]byte, err error) { if fields, err = MapperTitle(p); err != nil { return nil, err @@ -127,3 +144,10 @@ func MapperTitleSandcrawler(p []byte) (fields [][]byte, err error) { fields[0] = []byte(key) return fields, nil } + +// MapperPartial works on partial documents. +func MapperPartial(p []byte) (fields [][]byte, err error) { + // XXX: slugify authors, how to compare two author strings? What do these + // things look like? + return nil, nil +} |