diff options
Diffstat (limited to 'skate/url.go')
-rw-r--r-- | skate/url.go | 24 |
1 files changed, 20 insertions, 4 deletions
diff --git a/skate/url.go b/skate/url.go index f81b1d2..b59103f 100644 --- a/skate/url.go +++ b/skate/url.go @@ -12,8 +12,22 @@ var ( patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`) patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`) patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))([^?]*)$`) + patBrokenHttpSchema = regexp.MustCompile(`^https?[^://]*?.*`) + patBrokenSchemaPrefix = regexp.MustCompile(`(http|https)\W+(.*)`) + + okSchemas = []string{"http://", "https://", "ftp://"} ) +// hasAnyPrefix returns true if s starts with any of the given prefixes. +func hasAnyPrefix(s string, prefix []string) bool { + for _, p := range prefix { + if strings.HasPrefix(s, p) { + return true + } + } + return false +} + // SanitizeURL applies various cleanup rules on URLs as found in references. // Returns an empty string when no URL could be constructed. Still, many // results will not be a URL after all. XXX: Sometimes a URL contains other @@ -21,7 +35,7 @@ var ( // http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543. 
// http://10.3386/w20634https://doi.org/10.3386/w20634 func SanitizeURL(s string) string { - if !strings.HasPrefix(s, "http") && !strings.HasPrefix(s, "ftp") { + if !hasAnyPrefix(s, okSchemas) { s = sanitizeRaw(s) if s == "" { return s @@ -70,10 +84,12 @@ func sanitizeRaw(s string) string { } indices := index.Lookup([]byte("www."), 1) if len(indices) > 0 { - s = "http://" + s[indices[0]:] - } else { - s = "http://" + s + return "http://" + s[indices[0]:] + } + if patBrokenSchemaPrefix.MatchString(s) { + return patBrokenSchemaPrefix.ReplaceAllString(s, `http://$2`) } + s = "http://" + s return s // Re-trievedfrom } |