diff options
Diffstat (limited to 'skate/url.go')
| -rw-r--r-- | skate/url.go | 25 | 
1 file changed, 24 insertions, 1 deletion
diff --git a/skate/url.go b/skate/url.go index f0edddf..66b9312 100644 --- a/skate/url.go +++ b/skate/url.go @@ -11,7 +11,7 @@ var (  	patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)  	patHttpDOI             = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)  	patAccessedOn          = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`) -	patFileExtraSuffix     = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))(.*)$`) +	patFileExtraSuffix     = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))([^?]*)$`)  )  // SanitizeURL applies various cleanup rules on URLs as found in references. @@ -24,6 +24,12 @@ func SanitizeURL(s string) string {  		index   = suffixarray.New([]byte(s))  		indices = index.Lookup([]byte("http"), -1)  	) +	if !strings.HasPrefix(s, "http") { +		s = sanitizeRaw(s, index) +		if s == "" { +			return s +		} +	}  	if len(indices) > 1 {  		s = s[0:indices[1]] // only use the first  		s = strings.TrimRight(s, ":") @@ -51,3 +57,20 @@ func SanitizeURL(s string) string {  	}  	return s  } + +func sanitizeRaw(s string, index *suffixarray.Index) string { +	if len(s) < 4 { +		return "" +	} +	if !strings.Contains(s, ".") { +		return "" +	} +	indices := index.Lookup([]byte("www."), 1) +	if len(indices) > 0 { +		s = "http://" + s[indices[0]:] +	} else { +		s = "http://" + s +	} +	return s +	// Re-trievedfrom +}  | 
