diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-05-12 23:10:34 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-05-12 23:10:36 +0200 |
commit | 71f8167681218a74bee98f0bb7ff0123581ccb49 (patch) | |
tree | 9addef8033c9078547e3d426f512efa3e18ade9b /skate | |
parent | 1cf5e32e9a07d594d01dda210717cf799c32b3a2 (diff) | |
download | refcat-71f8167681218a74bee98f0bb7ff0123581ccb49.tar.gz refcat-71f8167681218a74bee98f0bb7ff0123581ccb49.zip |
skate-cleanup: add -X flag
Diffstat (limited to 'skate')
-rw-r--r-- | skate/cmd/skate-cleanup/main.go | 19 |
1 file changed, 11 insertions, 8 deletions
diff --git a/skate/cmd/skate-cleanup/main.go b/skate/cmd/skate-cleanup/main.go index 1965f08..06a80a2 100644 --- a/skate/cmd/skate-cleanup/main.go +++ b/skate/cmd/skate-cleanup/main.go @@ -22,13 +22,14 @@ import ( ) var ( - numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") - batchSize = flag.Int("b", 100000, "batch size") - delimiter = flag.String("d", "\t", "delimiter") - index = flag.Int("f", 1, "one field to cleanup up a doi, 1-indexed") - bestEffort = flag.Bool("B", false, "only log errors, but do not stop") - skipNonMatches = flag.Bool("S", false, "do not emit a line for non-matches") - what = flag.String("c", "doi", "what to clean: doi, url") + numWorkers = flag.Int("w", runtime.NumCPU(), "number of workers") + batchSize = flag.Int("b", 100000, "batch size") + delimiter = flag.String("d", "\t", "delimiter") + index = flag.Int("f", 1, "one field to cleanup up a doi, 1-indexed") + bestEffort = flag.Bool("B", false, "only log errors, but do not stop") + skipNonMatches = flag.Bool("S", false, "do not emit a line for non-matches") + what = flag.String("c", "doi", "what to clean: doi, url") + extendedCleanup = flag.Bool("X", false, "extended cleanup for urls") PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`) rxRelaxed = xurls.Relaxed() @@ -66,10 +67,12 @@ func urlFilter(p []byte) ([]byte, error) { } } url := rxRelaxed.FindString(parts[*index-1]) + if *extendedCleanup { + url = skate.SanitizeURL(url) + } if url == "" && *skipNonMatches { return nil, nil } - url = skate.SanitizeURL(url) if len(parts) == 1 || *index == len(parts) { url = url + "\n" } |