update docs

author: Martin Czygan <martin.czygan@gmail.com> 2021-06-09 22:53:06 +0200
committer: Martin Czygan <martin.czygan@gmail.com> 2021-06-09 22:53:06 +0200
commit: 1893399c5c98595facaa6161feda30813d637e5d (patch)
tree: 5a67a966207d4e0f0d254ddbed3260b5ab5fa592
parent: b1f61657b5d042979578b6573aa67365dc551146 (diff)
download: refcat-1893399c5c98595facaa6161feda30813d637e5d.tar.gz
refcat-1893399c5c98595facaa6161feda30813d637e5d.zip
1 files changed, 9 insertions, 5 deletions
diff --git a/skate/url.go b/skate/url.go
index cb14754..91f0185 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -16,15 +16,19 @@ var (
 	patBrokenHttpSchema    = regexp.MustCompile(`^https?[^://]*?.*`)
 	patBrokenSchemaPrefix  = regexp.MustCompile(`(http|https)\W+(.*)`)
 
-	okSchemas = []string{"http://", "https://", "ftp://"}
+	// Note: technically, only "http" is the scheme (https://stackoverflow.com/q/56297974/89391).
+	okPrefixes = []string{"http://", "https://", "ftp://"}
 )
 
 // SanitizeURL applies various cleanup rules on URLs (as they are found e.g. in
-// references extracted with GROBID). Returns an empty string when no URL
-// could be discovered. Still, many results will not be a URL even after
-// sanitization.
+// references extracted with GROBID). Returns an empty string when no URL could
+// be discovered. Still, many results will not be valid links even after
+// sanitization. This is a surprisingly expensive operation, processing only
+// about 20k URLs/s. A short circuit with a successful url.Parse does not
+// really work, as syntactically valid URL strings may still be improbable
+// URLs, e.g. http://!!!x.com.
 func SanitizeURL(s string) string {
-	if !hasAnyPrefix(s, okSchemas) {
+	if !hasAnyPrefix(s, okPrefixes) {
 		s = sanitizeRaw(s)
 		if s == "" {
 			return s
author	Martin Czygan <martin.czygan@gmail.com>	2021-06-09 22:53:06 +0200
committer	Martin Czygan <martin.czygan@gmail.com>	2021-06-09 22:53:06 +0200
commit	1893399c5c98595facaa6161feda30813d637e5d (patch)
tree	5a67a966207d4e0f0d254ddbed3260b5ab5fa592
parent	b1f61657b5d042979578b6573aa67365dc551146 (diff)
download	refcat-1893399c5c98595facaa6161feda30813d637e5d.tar.gz refcat-1893399c5c98595facaa6161feda30813d637e5d.zip