package skate import ( "fmt" "log" "strings" "github.com/segmentio/encoding/json" "mvdan.cc/xurls/v2" ) // TODO: This should be revisited entirely. var rxRelaxed = xurls.Relaxed() // URLFilter is a line oriented URL filter. type FilterURL struct { Delimiter string Index int BestEffort bool Aggressive bool SkipNonMatches bool AllowedSchemas []string } // Run executes the filter on a blob of data, most likely a line. func (f *FilterURL) Run(p []byte) ([]byte, error) { parts := strings.Split(string(p), f.Delimiter) if len(parts) < f.Index { msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p)) if f.BestEffort { log.Println(msg) return nil, nil } else { return nil, fmt.Errorf(msg) } } url := rxRelaxed.FindString(parts[f.Index-1]) if f.Aggressive { url = SanitizeURL(url) } if url == "" && f.SkipNonMatches { return nil, nil } if len(f.AllowedSchemas) > 0 && !HasAnyPrefix(url, f.AllowedSchemas) { return nil, nil } if len(parts) == 1 || f.Index == len(parts) { url = url + "\n" } parts[f.Index-1] = url return []byte(strings.Join(parts, f.Delimiter)), nil } // FilterDOI is a line oriented DOI filter. type FilterDOI struct { Delimiter string Index int BestEffort bool Aggressive bool SkipNonMatches bool AllowedSchema []string } // Run executes the filter on a blob of data, most likely a line. func (f *FilterDOI) Run(p []byte) ([]byte, error) { parts := strings.Split(string(p), f.Delimiter) if len(parts) < f.Index { msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p)) if f.BestEffort { log.Println(msg) return nil, nil } else { return nil, fmt.Errorf(msg) } } doi := PatDOI.FindString(parts[f.Index-1]) if doi == "" && f.SkipNonMatches { return nil, nil } parts[f.Index-1] = strings.ToLower(doi) return []byte(strings.Join(parts, f.Delimiter)), nil } // FilterRawRef is an ad-hoc filter. type FilterRawRef struct{} // Run executes the filter. TODO: Gather cleanup functions together and make // them more easily shared. func (f *FilterRawRef) Run(p []byte) ([]byte, error) { var ref Ref if err := json.Unmarshal(p, &ref); err != nil { return nil, err } if strings.Contains(ref.Biblio.Unstructured, "................") { return nil, nil } return p, nil }