diff options
Diffstat (limited to 'skate/url.go')
-rw-r--r-- | skate/url.go | 25 |
1 files changed, 24 insertions, 1 deletions
// sanitizeRaw turns a reference string that does not start with "http" into
// an "http://"-prefixed URL candidate.
//
// It returns the empty string when s is shorter than four bytes or contains
// no dot. If s contains "www.", everything before the leftmost occurrence is
// dropped before the scheme is prepended; otherwise the scheme is prepended
// to s unchanged.
//
// The index parameter is kept so existing callers continue to compile, but it
// is no longer consulted: suffixarray.Index.Lookup returns matches in
// unspecified order, so asking it for a single "www." hit could select a
// later occurrence and silently discard the beginning of the URL.
func sanitizeRaw(s string, index *suffixarray.Index) string {
	if len(s) < 4 || !strings.Contains(s, ".") {
		return ""
	}
	// Anchor on the leftmost "www." so leading non-URL text is cut off.
	if i := strings.Index(s, "www."); i >= 0 {
		s = s[i:]
	}
	// TODO(review): the original carried a stray "Re-trievedfrom" note after
	// the return — presumably a reminder to strip such prefixes; confirm.
	return "http://" + s
}