about | summary | refs | log | tree | commit | diff | stats
path: root/skate/url.go
diff options
context:
space:
mode:
Diffstat (limited to 'skate/url.go')
-rw-r--r-- skate/url.go 25
1 files changed, 24 insertions, 1 deletions
diff --git a/skate/url.go b/skate/url.go
index f0edddf..66b9312 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -11,7 +11,7 @@ var (
patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`)
- patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))(.*)$`)
+ patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))([^?]*)$`)
)
// SanitizeURL applies various cleanup rules on URLs as found in references.
@@ -24,6 +24,12 @@ func SanitizeURL(s string) string {
index = suffixarray.New([]byte(s))
indices = index.Lookup([]byte("http"), -1)
)
+ if !strings.HasPrefix(s, "http") {
+ s = sanitizeRaw(s, index)
+ if s == "" {
+ return s
+ }
+ }
if len(indices) > 1 {
s = s[0:indices[1]] // only use the first
s = strings.TrimRight(s, ":")
@@ -51,3 +57,20 @@ func SanitizeURL(s string) string {
}
return s
}
+
+// sanitizeRaw turns a scheme-less reference string into an "http://" URL.
+// It returns "" when s is shorter than four bytes or contains no ".". If s
+// contains "www.", everything before its leftmost occurrence is dropped.
+// The leftmost match is found with strings.Index: index.Lookup returns
+// matches unsorted, so asking it for one hit may yield a later "www.".
+// index is kept only for call compatibility and is no longer consulted.
+// TODO(review): "Re-trievedfrom" prefixes are not handled yet; confirm intent.
+func sanitizeRaw(s string, index *suffixarray.Index) string {
+	if len(s) < 4 || !strings.Contains(s, ".") {
+		return ""
+	}
+	if i := strings.Index(s, "www."); i > 0 {
+		s = s[i:]
+	}
+	return "http://" + s
+}