package skate

import (
	"bytes"
	"reflect"
	"runtime"
	"strconv"
	"strings"

	json "github.com/segmentio/encoding/json"
)

// Separators used when serializing mapper output as TSV.
var (
	bTab     = []byte("\t")
	bNewline = []byte("\n")
)

// TitleDoc is the minimal document shape needed to read the "title" field.
type TitleDoc struct {
	Title string `json:"title"`
}

// PartialDoc for docs that do not have DOI or title. E.g. we found 49701699
// (NCVY), 36401044 (NCVYU), 29668363 (NCUY), and so on.
type PartialDoc struct {
	ContainerName string `json:"container_name"`
	Contribs      []struct {
		RawName string `json:"raw_name"`
	} `json:"contribs"`
	Volume       string `json:"volume"`
	Unstructured string `json:"unstructured"`
	Year         string `json:"release_year"`
}

// Mapper maps a blob to an arbitrary number of fields, e.g. for (key,
// doc). We want fields, but we do not want to bake in TSV into each function.
type Mapper func([]byte) ([][]byte, error)

// TSV serializes the result of a field mapper as TSV: the fields are joined
// with tabs and terminated by a single newline. This is a slim adapter, e.g.
// to parallel.Processor, which expects this function signature.
//
// An error from the mapper is passed through unchanged. When the mapper
// yields no fields (mappers return nil, nil to skip a document), TSV returns
// no bytes at all instead of a bare newline, so skipped documents do not show
// up as empty lines in the output.
func (f Mapper) TSV(p []byte) ([]byte, error) {
	fields, err := f(p)
	if err != nil {
		return nil, err
	}
	if len(fields) == 0 {
		return nil, nil
	}
	return append(bytes.Join(fields, bTab), bNewline...), nil
}

// NameOf returns name of value, e.g. the name of a function. For function
// values the name reported by runtime.FuncForPC is used; for anything else
// (or if no function information is available) it falls back to the
// reflect.Value string representation.
func NameOf(f interface{}) string {
	v := reflect.ValueOf(f)
	if v.Kind() == reflect.Func {
		if rf := runtime.FuncForPC(v.Pointer()); rf != nil {
			return rf.Name()
		}
	}
	return v.String()
}

// Identity returns just the input again, as a single field.
func Identity(p []byte) ([][]byte, error) {
	return [][]byte{p}, nil
}

// CreateFixedMapper extracts the value from a given fixed top level json key.
// Returns a function that maps doc to (v, doc).
func CreateFixedMapper(field string) Mapper { f := func(p []byte) ([][]byte, error) { var ( doc map[string]interface{} v interface{} ok bool key []byte ) if err := json.Unmarshal(p, &doc); err != nil { return nil, err } if v, ok = doc[field]; !ok { return nil, nil } switch w := v.(type) { case string: key = []byte(w) case int: key = []byte(strconv.Itoa(w)) case int64: key = []byte(strconv.Itoa(int(w))) case float64: key = []byte(strconv.FormatFloat(w, 'f', 52, 64)) default: return nil, nil } return [][]byte{key, p}, nil } return f } // MapperTitle extracts (title, doc). func MapperTitle(p []byte) ([][]byte, error) { var ( doc TitleDoc key []byte ) if err := json.Unmarshal(p, &doc); err != nil { return nil, err } else { key = []byte(wsReplacer.Replace(strings.TrimSpace(doc.Title))) } return [][]byte{key, p}, nil } // MapperTitleNormalized extracts (title normalized, doc). func MapperTitleNormalized(p []byte) (fields [][]byte, err error) { if fields, err = MapperTitle(p); err != nil { return nil, err } key := string(fields[0]) key = wsReplacer.Replace(strings.TrimSpace(key)) key = strings.ToLower(key) key = repeatedWs.ReplaceAllString(key, " ") key = nonWord.ReplaceAllString(key, "") fields[0] = []byte(key) return fields, nil } // MapperTitleNormalized extracts (title nysiis, doc). func MapperTitleNysiis(p []byte) (fields [][]byte, err error) { if fields, err = MapperTitle(p); err != nil { return nil, err } key := string(fields[0]) key = wsReplacer.Replace(strings.TrimSpace(key)) key = NYSIIS(key) fields[0] = []byte(key) return fields, nil } // MapperTitleSandcrawler extracts (title sandcrawler, doc). func MapperTitleSandcrawler(p []byte) (fields [][]byte, err error) { if fields, err = MapperTitle(p); err != nil { return nil, err } key := string(fields[0]) key = sandcrawlerSlugify(wsReplacer.Replace(strings.TrimSpace(key))) fields[0] = []byte(key) return fields, nil } // MapperPartial works on partial documents. 
//
// It is currently a placeholder: the input is ignored and no fields and no
// error are returned.
func MapperPartial(p []byte) (fields [][]byte, err error) {
	// XXX: Not implemented yet. Open questions: should authors be slugified,
	// how do we compare two author strings, and what do these values look
	// like in practice?
	//
	// Both named results are still at their zero value (nil) here.
	return fields, err
}