package skate import ( "bytes" "errors" "reflect" "runtime" "strings" json "github.com/segmentio/encoding/json" "github.com/tidwall/gjson" ) var ( bTab = []byte("\t") bNewline = []byte("\n") ErrZeroFields = errors.New("zero fields") ErrMissingFieldName = errors.New("missing field name") ) // Title is a document with a title. type TitleDoc struct { Title string `json:"title"` } // PartialDoc for docs, that do not have DOI or title. E.g. we found 49701699 // (NCVY), 36401044 (NCVYU), 29668363 (NCUY), and so on. type PartialDoc struct { ContainerName string `json:"container_name"` Contribs []struct { RawName string `json:"raw_name"` } `json:"contribs"` Volume string `json:"volume"` Unstructured string `json:"unstructured"` Year string `json:"release_year"` } // Mapper maps a blob to an arbitrary number of fields, e.g. for (key, // doc). We want fields, but we do not want to bake in TSV into each function. type Mapper func([]byte) ([][]byte, error) // AsTSV serializes the result of a field mapper as TSV. This is a slim adapter, // e.g. to parallel.Processor, which expects this function signature. func (f Mapper) AsTSV(p []byte) ([]byte, error) { fields, err := f(p) if err != nil { return nil, err } return bytes.Join(fields, bTab), nil } // WithPrefix adds a given prefix to the first element. func WithPrefix(f Mapper, prefix string) Mapper { return func(p []byte) ([][]byte, error) { fields, err := f(p) if err != nil { return fields, err } if len(fields) == 0 { return nil, ErrZeroFields } fields[0] = append([]byte(prefix+":"), fields[0]...) return fields, err } } // NameOf returns name of value, e.g. the name of a function. func NameOf(f interface{}) string { v := reflect.ValueOf(f) if v.Kind() == reflect.Func { if rf := runtime.FuncForPC(v.Pointer()); rf != nil { return rf.Name() } } return v.String() } // Identifier returns just the input again. func Identity(p []byte) ([][]byte, error) { return [][]byte{p}, nil } // CreateFixedMapper extract the value from a given fixed top level json key. // Returns a function that maps doc to (v, doc). func CreateFixedMapper(field string) Mapper { f := func(p []byte) ([][]byte, error) { result := gjson.GetBytes(p, field) key := []byte(result.String()) return [][]byte{key, p}, nil } return f } // MapperTitle extracts (title, doc). func MapperTitle(p []byte) ([][]byte, error) { var ( doc TitleDoc key []byte ) if err := json.Unmarshal(p, &doc); err != nil { return nil, err } else { key = []byte(wsReplacer.Replace(strings.TrimSpace(doc.Title))) } return [][]byte{key, p}, nil } // MapperTitleNormalized extracts (title normalized, doc). func MapperTitleNormalized(p []byte) (fields [][]byte, err error) { if fields, err = MapperTitle(p); err != nil { return nil, err } key := string(fields[0]) key = wsReplacer.Replace(strings.TrimSpace(key)) key = strings.ToLower(key) key = repeatedWs.ReplaceAllString(key, " ") key = nonWord.ReplaceAllString(key, "") fields[0] = []byte(key) return fields, nil } // MapperTitleNormalized extracts (title nysiis, doc). func MapperTitleNysiis(p []byte) (fields [][]byte, err error) { if fields, err = MapperTitle(p); err != nil { return nil, err } key := string(fields[0]) key = wsReplacer.Replace(strings.TrimSpace(key)) key = NYSIIS(key) fields[0] = []byte(key) return fields, nil } // MapperTitleSandcrawler extracts (title sandcrawler, doc). func MapperTitleSandcrawler(p []byte) (fields [][]byte, err error) { if fields, err = MapperTitle(p); err != nil { return nil, err } key := string(fields[0]) key = sandcrawlerSlugify(wsReplacer.Replace(strings.TrimSpace(key))) fields[0] = []byte(key) return fields, nil } // MapperPartial works on partial documents. func MapperPartial(p []byte) (fields [][]byte, err error) { // XXX: slugify authors, how to compare two author strings? How do these // things look like? return nil, nil }