aboutsummaryrefslogtreecommitdiffstats
path: root/skate
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-06-09 22:53:06 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-06-09 22:53:06 +0200
commit1893399c5c98595facaa6161feda30813d637e5d (patch)
tree5a67a966207d4e0f0d254ddbed3260b5ab5fa592 /skate
parentb1f61657b5d042979578b6573aa67365dc551146 (diff)
downloadrefcat-1893399c5c98595facaa6161feda30813d637e5d.tar.gz
refcat-1893399c5c98595facaa6161feda30813d637e5d.zip
update docs
Diffstat (limited to 'skate')
-rw-r--r--skate/url.go14
1 files changed, 9 insertions, 5 deletions
diff --git a/skate/url.go b/skate/url.go
index cb14754..91f0185 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -16,15 +16,19 @@ var (
patBrokenHttpSchema = regexp.MustCompile(`^https?[^://]*?.*`)
patBrokenSchemaPrefix = regexp.MustCompile(`(http|https)\W+(.*)`)
- okSchemas = []string{"http://", "https://", "ftp://"}
+ // Note: technically, only "http" is the scheme (https://stackoverflow.com/q/56297974/89391).
+ okPrefixes = []string{"http://", "https://", "ftp://"}
)
// SanitizeURL applies various cleanup rules on URLs (as they are found e.g. in
-// references extracted with GROBID). Returns an empty string when no URL
-// could be discovered. Still, many results will not be a URL even after
-// sanitization.
+// references extracted with GROBID). Returns an empty string when no URL could
+// be discovered. Still, many results will not be valid links even after
+// sanitization. This is a surprisingly expensive operation, processing only
+// about 20k URLs/s. A short circuit with a successful url.Parse does not
+// really work, as syntactically valid URL strings may still be improbable
+// URLs, e.g. http://!!!x.com, etc.
func SanitizeURL(s string) string {
- if !hasAnyPrefix(s, okSchemas) {
+ if !hasAnyPrefix(s, okPrefixes) {
s = sanitizeRaw(s)
if s == "" {
return s