diff options
author | Martin Czygan <martin.czygan@gmail.com> | 2021-06-09 22:53:06 +0200 |
---|---|---|
committer | Martin Czygan <martin.czygan@gmail.com> | 2021-06-09 22:53:06 +0200 |
commit | 1893399c5c98595facaa6161feda30813d637e5d (patch) | |
tree | 5a67a966207d4e0f0d254ddbed3260b5ab5fa592 | |
parent | b1f61657b5d042979578b6573aa67365dc551146 (diff) | |
download | refcat-1893399c5c98595facaa6161feda30813d637e5d.tar.gz refcat-1893399c5c98595facaa6161feda30813d637e5d.zip |
update docs
-rw-r--r-- | skate/url.go | 14 |
1 files changed, 9 insertions, 5 deletions
```diff
diff --git a/skate/url.go b/skate/url.go
index cb14754..91f0185 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -16,15 +16,19 @@ var (
 	patBrokenHttpSchema   = regexp.MustCompile(`^https?[^://]*?.*`)
 	patBrokenSchemaPrefix = regexp.MustCompile(`(http|https)\W+(.*)`)
 
-	okSchemas = []string{"http://", "https://", "ftp://"}
+	// Note: technically, only "http" is the schema (https://stackoverflow.com/q/56297974/89391).
+	okPrefixes = []string{"http://", "https://", "ftp://"}
 )
 
 // SanitizeURL applies various cleanup rules on URLs (as they are found e.g. in
-// references extracted with GROBID). Returns an empty string when no URL
-// could be discovered. Still, many results will not be a URL even after
-// sanitization.
+// references extracted with GROBID). Returns an empty string when no URL could
+// be discovered. Still, many results will not be valid links even after
+// sanitization. This is a surprisingly expensive operation, roughly processing
+// 20k urls/s, only. A short circuit with a successful url.Parse does not
+// really work, as syntactically valid URL strings may still be improbable
+// URLs, e.g. http://!!!x.com, etc.
 func SanitizeURL(s string) string {
-	if !hasAnyPrefix(s, okSchemas) {
+	if !hasAnyPrefix(s, okPrefixes) {
 		s = sanitizeRaw(s)
 		if s == "" {
 			return s
```