aboutsummaryrefslogtreecommitdiffstats
path: root/skate/cleanup.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/cleanup.go')
-rw-r--r--skate/cleanup.go78
1 files changed, 78 insertions, 0 deletions
diff --git a/skate/cleanup.go b/skate/cleanup.go
new file mode 100644
index 0000000..b50e048
--- /dev/null
+++ b/skate/cleanup.go
@@ -0,0 +1,78 @@
+package skate
+
+import (
+ "fmt"
+ "log"
+ "strings"
+
+ "mvdan.cc/xurls/v2"
+)
+
// rxRelaxed is the package-wide URL pattern returned by xurls.Relaxed. It is
// built once at package initialization and used by FilterURL.Run to extract
// the first URL-like match from a field.
var rxRelaxed = xurls.Relaxed()
+
// FilterURL is a line oriented URL filter. Its Run method splits a line into
// fields and replaces one of them with the URL found inside it.
type FilterURL struct {
	// Delimiter is the separator used to split a line into fields and to
	// join the fields again afterwards.
	Delimiter string
	// Index is the 1-based position of the field to filter; Run accesses
	// the field as parts[Index-1].
	Index int
	// BestEffort makes Run log a warning and drop lines that have fewer
	// than Index fields, instead of returning an error.
	BestEffort bool
	// Aggressive additionally passes the matched URL through SanitizeURL.
	Aggressive bool
	// SkipNonMatches makes Run drop lines for which no URL is left after
	// matching (and sanitizing, if enabled).
	SkipNonMatches bool
	// AllowedSchemas, when non-empty, makes Run drop lines whose URL is not
	// accepted by HasAnyPrefix for this list.
	AllowedSchemas []string
}
+
+func (f *FilterURL) Run(p []byte) ([]byte, error) {
+ parts := strings.Split(string(p), f.Delimiter)
+ if len(parts) < f.Index {
+ msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p))
+ if f.BestEffort {
+ log.Println(msg)
+ return nil, nil
+ } else {
+ return nil, fmt.Errorf(msg)
+ }
+ }
+ url := rxRelaxed.FindString(parts[f.Index-1])
+ if f.Aggressive {
+ url = SanitizeURL(url)
+ }
+ if url == "" && f.SkipNonMatches {
+ return nil, nil
+ }
+ if len(f.AllowedSchemas) > 0 && !HasAnyPrefix(url, f.AllowedSchemas) {
+ return nil, nil
+ }
+ if len(parts) == 1 || f.Index == len(parts) {
+ url = url + "\n"
+ }
+ parts[f.Index-1] = url
+ return []byte(strings.Join(parts, f.Delimiter)), nil
+}
+
// FilterDOI is a line oriented DOI filter. Its Run method splits a line into
// fields and replaces one of them with the lowercased DOI found inside it.
type FilterDOI struct {
	// Delimiter is the separator used to split a line into fields and to
	// join the fields again afterwards.
	Delimiter string
	// Index is the 1-based position of the field to filter; Run accesses
	// the field as parts[Index-1].
	Index int
	// BestEffort makes Run log a warning and drop lines that have fewer
	// than Index fields, instead of returning an error.
	BestEffort bool
	// Aggressive is not consulted by Run in the code shown here.
	Aggressive bool
	// SkipNonMatches makes Run drop lines in which no DOI was found.
	SkipNonMatches bool
	// AllowedSchema is not consulted by Run in the code shown here.
	AllowedSchema []string
}
+
+func (f *FilterDOI) Run(p []byte) ([]byte, error) {
+ parts := strings.Split(string(p), f.Delimiter)
+ if len(parts) < f.Index {
+ msg := fmt.Sprintf("warn: line has too few fields (%d): %s", len(parts), string(p))
+ if f.BestEffort {
+ log.Println(msg)
+ return nil, nil
+ } else {
+ return nil, fmt.Errorf(msg)
+ }
+ }
+ doi := PatDOI.FindString(parts[f.Index-1])
+ if doi == "" && f.SkipNonMatches {
+ return nil, nil
+ }
+ parts[f.Index-1] = strings.ToLower(doi)
+ return []byte(strings.Join(parts, f.Delimiter)), nil
+}