diff options
Diffstat (limited to 'skate/cleanup.go')
-rw-r--r-- | skate/cleanup.go | 78 |
1 files changed, 78 insertions, 0 deletions
diff --git a/skate/cleanup.go b/skate/cleanup.go new file mode 100644 index 0000000..b50e048 --- /dev/null +++ b/skate/cleanup.go @@ -0,0 +1,78 @@ +package skate + +import ( + "fmt" + "log" + "strings" + + "mvdan.cc/xurls/v2" +) + +var rxRelaxed = xurls.Relaxed() + +// URLFilter is a line oriented URL filter. +type FilterURL struct { + Delimiter string + Index int + BestEffort bool + Aggressive bool + SkipNonMatches bool + AllowedSchemas []string +} + +func (f *FilterURL) Run(p []byte) ([]byte, error) { + parts := strings.Split(string(p), f.Delimiter) + if len(parts) < f.Index { + msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p)) + if f.BestEffort { + log.Println(msg) + return nil, nil + } else { + return nil, fmt.Errorf(msg) + } + } + url := rxRelaxed.FindString(parts[f.Index-1]) + if f.Aggressive { + url = SanitizeURL(url) + } + if url == "" && f.SkipNonMatches { + return nil, nil + } + if len(f.AllowedSchemas) > 0 && !HasAnyPrefix(url, f.AllowedSchemas) { + return nil, nil + } + if len(parts) == 1 || f.Index == len(parts) { + url = url + "\n" + } + parts[f.Index-1] = url + return []byte(strings.Join(parts, f.Delimiter)), nil +} + +// FilterDOI is a line oriented DOI filter. +type FilterDOI struct { + Delimiter string + Index int + BestEffort bool + Aggressive bool + SkipNonMatches bool + AllowedSchema []string +} + +func (f *FilterDOI) Run(p []byte) ([]byte, error) { + parts := strings.Split(string(p), f.Delimiter) + if len(parts) < f.Index { + msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p)) + if f.BestEffort { + log.Println(msg) + return nil, nil + } else { + return nil, fmt.Errorf(msg) + } + } + doi := PatDOI.FindString(parts[f.Index-1]) + if doi == "" && f.SkipNonMatches { + return nil, nil + } + parts[f.Index-1] = strings.ToLower(doi) + return []byte(strings.Join(parts, f.Delimiter)), nil +} |