about summary refs log tree commit diff stats
path: root/skate/cmd/skate-cleanup/main.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/cmd/skate-cleanup/main.go')
-rw-r--r-- skate/cmd/skate-cleanup/main.go 19
1 files changed, 11 insertions, 8 deletions
diff --git a/skate/cmd/skate-cleanup/main.go b/skate/cmd/skate-cleanup/main.go
index 1965f08..06a80a2 100644
--- a/skate/cmd/skate-cleanup/main.go
+++ b/skate/cmd/skate-cleanup/main.go
@@ -22,13 +22,14 @@ import (
)
var (
- numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
- batchSize = flag.Int("b", 100000, "batch size")
- delimiter = flag.String("d", "\t", "delimiter")
- index = flag.Int("f", 1, "one field to cleanup up a doi, 1-indexed")
- bestEffort = flag.Bool("B", false, "only log errors, but do not stop")
- skipNonMatches = flag.Bool("S", false, "do not emit a line for non-matches")
- what = flag.String("c", "doi", "what to clean: doi, url")
+ numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers")
+ batchSize = flag.Int("b", 100000, "batch size")
+ delimiter = flag.String("d", "\t", "delimiter")
+ index = flag.Int("f", 1, "one field to cleanup up a doi, 1-indexed")
+ bestEffort = flag.Bool("B", false, "only log errors, but do not stop")
+ skipNonMatches = flag.Bool("S", false, "do not emit a line for non-matches")
+ what = flag.String("c", "doi", "what to clean: doi, url")
+ extendedCleanup = flag.Bool("X", false, "extended cleanup for urls")
PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
rxRelaxed = xurls.Relaxed()
@@ -66,10 +67,12 @@ func urlFilter(p []byte) ([]byte, error) {
}
}
url := rxRelaxed.FindString(parts[*index-1])
+ if *extendedCleanup {
+ url = skate.SanitizeURL(url)
+ }
if url == "" && *skipNonMatches {
return nil, nil
}
- url = skate.SanitizeURL(url)
if len(parts) == 1 || *index == len(parts) {
url = url + "\n"
}