diff options
-rw-r--r-- | skate/cmd/skate-cleanup/main.go | 19 |
1 files changed, 11 insertions, 8 deletions
diff --git a/skate/cmd/skate-cleanup/main.go b/skate/cmd/skate-cleanup/main.go index 1965f08..06a80a2 100644 --- a/skate/cmd/skate-cleanup/main.go +++ b/skate/cmd/skate-cleanup/main.go @@ -22,13 +22,14 @@ import ( ) var ( - numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") - batchSize = flag.Int("b", 100000, "batch size") - delimiter = flag.String("d", "\t", "delimiter") - index = flag.Int("f", 1, "one field to cleanup up a doi, 1-indexed") - bestEffort = flag.Bool("B", false, "only log errors, but do not stop") - skipNonMatches = flag.Bool("S", false, "do not emit a line for non-matches") - what = flag.String("c", "doi", "what to clean: doi, url") + numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") + batchSize = flag.Int("b", 100000, "batch size") + delimiter = flag.String("d", "\t", "delimiter") + index = flag.Int("f", 1, "one field to cleanup up a doi, 1-indexed") + bestEffort = flag.Bool("B", false, "only log errors, but do not stop") + skipNonMatches = flag.Bool("S", false, "do not emit a line for non-matches") + what = flag.String("c", "doi", "what to clean: doi, url") + extendedCleanup = flag.Bool("X", false, "extended cleanup for urls") PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`) rxRelaxed = xurls.Relaxed() @@ -66,10 +67,12 @@ func urlFilter(p []byte) ([]byte, error) { } } url := rxRelaxed.FindString(parts[*index-1]) + if *extendedCleanup { + url = skate.SanitizeURL(url) + } if url == "" && *skipNonMatches { return nil, nil } - url = skate.SanitizeURL(url) if len(parts) == 1 || *index == len(parts) { url = url + "\n" } |