Diffstat (limited to 'skate')
 skate/url.go | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/skate/url.go b/skate/url.go
index cb14754..91f0185 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -16,15 +16,19 @@ var (
 	patBrokenHttpSchema = regexp.MustCompile(`^https?[^://]*?.*`)
 	patBrokenSchemaPrefix = regexp.MustCompile(`(http|https)\W+(.*)`)
-	okSchemas = []string{"http://", "https://", "ftp://"}
+	// Note: technically, only "http" is the schema (https://stackoverflow.com/q/56297974/89391).
+	okPrefixes = []string{"http://", "https://", "ftp://"}
 )
 // SanitizeURL applies various cleanup rules on URLs (as they are found e.g. in
-// references extracted with GROBID). Returns an empty string when no URL
-// could be discovered. Still, many results will not be a URL even after
-// sanitization.
+// references extracted with GROBID). Returns an empty string when no URL could
+// be discovered. Still, many results will not be valid links even after
+// sanitization. This is a surprisingly expensive operation, processing only
+// roughly 20K URLs/s. A short circuit with a successful url.Parse does not
+// really work, as syntactically valid URL strings may still be improbable
+// URLs, e.g. http://!!!x.com, etc.
 func SanitizeURL(s string) string {
-	if !hasAnyPrefix(s, okSchemas) {
+	if !hasAnyPrefix(s, okPrefixes) {
 		s = sanitizeRaw(s)
 		if s == "" {
 			return s
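
The remark about url.Parse in the new doc comment can be verified directly: per RFC 3986, "!" is a sub-delim and thus legal in a host name, so net/url parses http://!!!x.com without error. Below is a minimal, self-contained sketch; the hasAnyPrefix helper is not shown in this hunk, so the loop here is an assumed implementation of it.

package main

import (
	"fmt"
	"net/url"
	"strings"
)

// hasAnyPrefix is not part of this hunk; assumed implementation: reports
// whether s starts with any of the given prefixes.
func hasAnyPrefix(s string, prefixes []string) bool {
	for _, p := range prefixes {
		if strings.HasPrefix(s, p) {
			return true
		}
	}
	return false
}

func main() {
	// Syntactically valid per RFC 3986 ("!" is a sub-delim, allowed in a
	// reg-name host), so Parse succeeds even though the URL is improbable.
	u, err := url.Parse("http://!!!x.com")
	fmt.Println(u, err) // http://!!!x.com <nil>

	okPrefixes := []string{"http://", "https://", "ftp://"}
	fmt.Println(hasAnyPrefix("https://example.org", okPrefixes)) // true
	fmt.Println(hasAnyPrefix("example.org", okPrefixes))         // false
}

Since Parse accepts such strings, the prefix check and the regex-based cleanup above remain necessary; a successful parse alone is no proof of a plausible link.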