1 files changed, 24 insertions, 1 deletions
diff --git a/skate/url.go b/skate/url.go
index f0edddf..66b9312 100644
--- a/skate/url.go
+++ b/skate/url.go
@@ -11,7 +11,7 @@ var (
 	patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
 	patHttpDOI             = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
 	patAccessedOn          = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`)
-	patFileExtraSuffix     = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))(.*)$`)
+	patFileExtraSuffix     = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))([^?]*)$`)
 )
 
 // SanitizeURL applies various cleanup rules on URLs as found in references.
@@ -24,6 +24,12 @@ func SanitizeURL(s string) string {
 		index   = suffixarray.New([]byte(s))
 		indices = index.Lookup([]byte("http"), -1)
 	)
+	if !strings.HasPrefix(s, "http") {
+		s = sanitizeRaw(s, index)
+		if s == "" {
+			return s
+		}
+	}
 	if len(indices) > 1 {
 		s = s[0:indices[1]] // only use the first
 		s = strings.TrimRight(s, ":")
@@ -51,3 +57,20 @@ func SanitizeURL(s string) string {
 	}
 	return s
 }
+
+// sanitizeRaw prefixes "http://" to s and returns the result. It returns ""
+// if s is shorter than four bytes or contains no dot. When s contains "www.",
+// everything before the first (leftmost) occurrence is dropped beforehand.
+// index is kept for caller compatibility but is no longer consulted:
+// suffixarray's Lookup returns matches unsorted, so Lookup(..., 1) could pick
+// a later "www." and silently cut off the start of the URL.
+func sanitizeRaw(s string, index *suffixarray.Index) string {
+	if len(s) < 4 || !strings.Contains(s, ".") {
+		return ""
+	}
+	// NOTE(review): a bare "Re-trievedfrom" note sat here; presumably a TODO.
+	if i := strings.Index(s, "www."); i >= 0 {
+		s = s[i:]
+	}
+	return "http://" + s
+}