aboutsummaryrefslogtreecommitdiffstats
path: root/skate/url.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/url.go')
-rw-r--r--skate/url.go24
1 files changed, 20 insertions, 4 deletions
diff --git a/skate/url.go b/skate/url.go
index f81b1d2..b59103f 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -12,8 +12,22 @@ var (
patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`)
patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))([^?]*)$`)
+ patBrokenHttpSchema = regexp.MustCompile(`^https?[^://]*?.*`)
+ patBrokenSchemaPrefix = regexp.MustCompile(`(http|https)\W+(.*)`)
+
+ okSchemas = []string{"http://", "https://", "ftp://"}
)
+// hasAnyPrefix reports whether s starts with any of the given prefixes.
+func hasAnyPrefix(s string, prefix []string) bool {
+ for _, p := range prefix {
+ if strings.HasPrefix(s, p) {
+ return true
+ }
+ }
+ return false
+}
+
// SanitizeURL applies various cleanup rules on URLs as found in references.
// Returns an empty string when no URL could be constructed. Still, many
// results will not be a URL after all. XXX: Sometimes a URL contains other
@@ -21,7 +35,7 @@ var (
// http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543.
// http://10.3386/w20634https://doi.org/10.3386/w20634
func SanitizeURL(s string) string {
- if !strings.HasPrefix(s, "http") && !strings.HasPrefix(s, "ftp") {
+ if !hasAnyPrefix(s, okSchemas) {
s = sanitizeRaw(s)
if s == "" {
return s
@@ -70,10 +84,12 @@ func sanitizeRaw(s string) string {
}
indices := index.Lookup([]byte("www."), 1)
if len(indices) > 0 {
- s = "http://" + s[indices[0]:]
- } else {
- s = "http://" + s
+ return "http://" + s[indices[0]:]
+ }
+ if patBrokenSchemaPrefix.MatchString(s) {
+ return patBrokenSchemaPrefix.ReplaceAllString(s, `http://$2`)
}
+ s = "http://" + s
return s
// Re-trievedfrom
}