Diffstat (limited to 'skate')
 skate/url.go | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)
diff --git a/skate/url.go b/skate/url.go
index cb14754..91f0185 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -16,15 +16,19 @@ var (
 	patBrokenHttpSchema = regexp.MustCompile(`^https?[^://]*?.*`)
 	patBrokenSchemaPrefix = regexp.MustCompile(`(http|https)\W+(.*)`)
-	okSchemas = []string{"http://", "https://", "ftp://"}
+	// Note: technically, only "http" is the schema (https://stackoverflow.com/q/56297974/89391).
+	okPrefixes = []string{"http://", "https://", "ftp://"}
 )
 // SanitizeURL applies various cleanup rules on URLs (as they are found e.g. in
-// references extracted with GROBID). Returns an empty string when no URL
-// could be discovered. Still, many results will not be a URL even after
-// sanitization.
+// references extracted with GROBID). Returns an empty string when no URL could
+// be discovered. Still, many results will not be valid links even after
+// sanitization. This is a surprisingly expensive operation, processing only
+// roughly 20K URLs/s. A short circuit with a successful url.Parse does not
+// really work, as syntactically valid URL strings may still be improbable
+// URLs, e.g. http://!!!x.com, etc.
 func SanitizeURL(s string) string {
-	if !hasAnyPrefix(s, okSchemas) {
+	if !hasAnyPrefix(s, okPrefixes) {
 		s = sanitizeRaw(s)
 		if s == "" {
 			return s
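
The remark about url.Parse in the new doc comment can be verified directly: per RFC 3986, "!" is a sub-delim and thus legal in a host name, so net/url parses http://!!!x.com without error. Below is a minimal, self-contained sketch; the hasAnyPrefix helper is not shown in this hunk, so the loop here is an assumed implementation of it.

package main

import (
	"fmt"
	"net/url"
	"strings"
)

// hasAnyPrefix is not part of this hunk; assumed implementation: reports
// whether s starts with any of the given prefixes.
func hasAnyPrefix(s string, prefixes []string) bool {
	for _, p := range prefixes {
		if strings.HasPrefix(s, p) {
			return true
		}
	}
	return false
}

func main() {
	// Syntactically valid per RFC 3986 ("!" is a sub-delim, allowed in a
	// reg-name host), so Parse succeeds even though the URL is improbable.
	u, err := url.Parse("http://!!!x.com")
	fmt.Println(u, err) // http://!!!x.com <nil>

	okPrefixes := []string{"http://", "https://", "ftp://"}
	fmt.Println(hasAnyPrefix("https://example.org", okPrefixes)) // true
	fmt.Println(hasAnyPrefix("example.org", okPrefixes))         // false
}

Since Parse accepts such strings, the prefix check and the regex-based cleanup above remain necessary; a successful parse alone is no proof of a plausible link.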