diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-06-10 02:09:06 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-06-10 02:09:06 +0200 |
commit | 4ac97dd20887a7adf97bdd6ec83e65ccd283b045 (patch) | |
tree | 5e3741149c7c909d177b6ed892e5b6eeae969521 /skate/cmd | |
parent | 88b5068171df640724482f9c77d5bec7d62ffa43 (diff) | |
download | refcat-4ac97dd20887a7adf97bdd6ec83e65ccd283b045.tar.gz refcat-4ac97dd20887a7adf97bdd6ec83e65ccd283b045.zip |
restrict url cleaning a bit more
Diffstat (limited to 'skate/cmd')
-rw-r--r-- | skate/cmd/skate-cleanup/main.go | 21 |
1 files changed, 19 insertions, 2 deletions
```diff
diff --git a/skate/cmd/skate-cleanup/main.go b/skate/cmd/skate-cleanup/main.go
index cf43732..033b0bc 100644
--- a/skate/cmd/skate-cleanup/main.go
+++ b/skate/cmd/skate-cleanup/main.go
@@ -30,13 +30,18 @@ var (
 	skipNonMatches  = flag.Bool("S", false, "do not emit a line for non-matches")
 	what            = flag.String("c", "doi", "what to clean: doi, url")
 	extendedCleanup = flag.Bool("X", false, "extended (and slower) cleanup for urls")
+	allow           = flag.String("allow", "http,https", "comma separted list of schemas to allow for urls")
 
-	PatDOI    = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
-	rxRelaxed = xurls.Relaxed()
+	PatDOI         = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
+	rxRelaxed      = xurls.Relaxed()
+	allowedSchemas []string // parsed from allow flag
 )
 
 func main() {
 	flag.Parse()
+	for _, v := range strings.Split(*allow, ",") {
+		allowedSchemas = append(allowedSchemas, v)
+	}
 	var f func([]byte) ([]byte, error)
 	switch *what {
 	case "doi":
@@ -54,6 +59,15 @@ func main() {
 	}
 }
 
+func hasAnyPrefix(s string, prefixes []string) bool {
+	for _, p := range prefixes {
+		if strings.HasPrefix(s, p) {
+			return true
+		}
+	}
+	return false
+}
+
 // urlFilter parses finds the first URL.
 func urlFilter(p []byte) ([]byte, error) {
 	parts := strings.Split(string(p), *delimiter)
@@ -73,6 +87,9 @@ func urlFilter(p []byte) ([]byte, error) {
 	if url == "" && *skipNonMatches {
 		return nil, nil
 	}
+	if len(allowedSchemas) > 0 && !hasAnyPrefix(url, allowedSchemas) {
+		return nil, nil
+	}
 	if len(parts) == 1 || *index == len(parts) {
 		url = url + "\n"
 	}
```