about | summary | refs | log | tree | commit | diff | stats
path: root/skate/cmd
diff options
context:
space:
mode:
author: Martin Czygan <martin.czygan@gmail.com> 2021-06-10 02:09:06 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-06-10 02:09:06 +0200
commit: 4ac97dd20887a7adf97bdd6ec83e65ccd283b045 (patch)
tree: 5e3741149c7c909d177b6ed892e5b6eeae969521 /skate/cmd
parent: 88b5068171df640724482f9c77d5bec7d62ffa43 (diff)
download: refcat-4ac97dd20887a7adf97bdd6ec83e65ccd283b045.tar.gz
download: refcat-4ac97dd20887a7adf97bdd6ec83e65ccd283b045.zip
restrict url cleaning a bit more
Diffstat (limited to 'skate/cmd')
-rw-r--r-- skate/cmd/skate-cleanup/main.go 21
1 files changed, 19 insertions, 2 deletions
diff --git a/skate/cmd/skate-cleanup/main.go b/skate/cmd/skate-cleanup/main.go
index cf43732..033b0bc 100644
--- a/skate/cmd/skate-cleanup/main.go
+++ b/skate/cmd/skate-cleanup/main.go
@@ -30,13 +30,18 @@ var (
skipNonMatches = flag.Bool("S", false, "do not emit a line for non-matches")
what = flag.String("c", "doi", "what to clean: doi, url")
extendedCleanup = flag.Bool("X", false, "extended (and slower) cleanup for urls")
+ allow = flag.String("allow", "http,https", "comma separted list of schemas to allow for urls")
- PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
- rxRelaxed = xurls.Relaxed()
+ PatDOI = regexp.MustCompile(`10[.][0-9]{1,8}/[^ ]*[\w]`)
+ rxRelaxed = xurls.Relaxed()
+ allowedSchemas []string // parsed from allow flag
)
func main() {
flag.Parse()
+ for _, v := range strings.Split(*allow, ",") {
+ allowedSchemas = append(allowedSchemas, v)
+ }
var f func([]byte) ([]byte, error)
switch *what {
case "doi":
@@ -54,6 +59,15 @@ func main() {
}
}
+func hasAnyPrefix(s string, prefixes []string) bool { // hasAnyPrefix reports whether s starts with at least one entry of prefixes.
+ for _, p := range prefixes {
+ if strings.HasPrefix(s, p) { // plain string-prefix test: a prefix of "http" also accepts any s beginning with "https"
+ return true
+ }
+ }
+ return false // no prefix matched; also the result when prefixes is nil or empty
+}
+
// urlFilter parses finds the first URL.
func urlFilter(p []byte) ([]byte, error) {
parts := strings.Split(string(p), *delimiter)
@@ -73,6 +87,9 @@ func urlFilter(p []byte) ([]byte, error) {
if url == "" && *skipNonMatches {
return nil, nil
}
+ if len(allowedSchemas) > 0 && !hasAnyPrefix(url, allowedSchemas) {
+ return nil, nil
+ }
if len(parts) == 1 || *index == len(parts) {
url = url + "\n"
}