diff options
Diffstat (limited to 'skate')
-rw-r--r-- | skate/url.go | 14 |
1 files changed, 9 insertions, 5 deletions
diff --git a/skate/url.go b/skate/url.go index cb14754..91f0185 100644 --- a/skate/url.go +++ b/skate/url.go @@ -16,15 +16,19 @@ var ( patBrokenHttpSchema = regexp.MustCompile(`^https?[^://]*?.*`) patBrokenSchemaPrefix = regexp.MustCompile(`(http|https)\W+(.*)`) - okSchemas = []string{"http://", "https://", "ftp://"} + // Note: technically, only "http" is the schema (https://stackoverflow.com/q/56297974/89391). + okPrefixes = []string{"http://", "https://", "ftp://"} ) // SanitizeURL applies various cleanup rules on URLs (as they are found e.g. in -// references extracted with GROBID). Returns an empty string when no URL -// could be discovered. Still, many results will not be a URL even after -// sanitization. +// references extracted with GROBID). Returns an empty string when no URL could +// be discovered. Still, many results will not be valid links even after +// sanitization. This is a surprisingly expensive operation, roughly processing +// 20k urls/s, only. A short circuit with a successful url.Parse does not +// really work, as syntactically valid URL strings may still be improbable +// URLs, e.g. http://!!!x.com, etc. func SanitizeURL(s string) string { - if !hasAnyPrefix(s, okSchemas) { + if !hasAnyPrefix(s, okPrefixes) { s = sanitizeRaw(s) if s == "" { return s |