package skate

import (
	"regexp"
)

var (
	// patNonWordDomain captures a run of non-word characters that directly
	// follows the scheme, e.g. "http://!!!:example.com".
	patNonWordDomain = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`)

	// patRepeatedHttpSlashes captures extra slashes directly after the
	// scheme, e.g. "http:///en.m.wikipedia.org/ChenLong".
	patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)

	// patHttpDOI captures a bare DOI used as if it were a host name, e.g.
	// "http://10.1113/jphysiol.2002.026047".
	patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)

	// patAccessedOn captures a trailing "accessed on" style note (in several
	// languages, matched case-insensitively) that got glued to the URL. The
	// U flag makes the leading group lazy, so the match starts at the first
	// keyword occurrence and an optional "." or ";" right before the keyword
	// is left out of group 1.
	patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;]?(abgerufen|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|lastaccessed|acesso|accessoem|accessed).*$`)

	// patFileExtraSuffix captures text glued behind a well-known file
	// extension. Go regexps prefer the leftmost alternative, so "docx" has to
	// be listed before "doc"; otherwise ".docx" would be cut down to ".doc".
	patFileExtraSuffix = regexp.MustCompile(`(http.*[.](zip|pdf|html|docx|doc|rar))(.*)$`)
)

// SanitizeURL applies various cleanup rules on URLs as found in references.
//
// The rules run in a fixed order, each one working on the output of the
// previous one:
//
//  1. drop non-word characters directly after the scheme,
//  2. collapse repeated slashes after the scheme,
//  3. rewrite "http://10.x/..." into a "https://doi.org/10.x/..." link,
//  4. cut off a trailing "accessed on ..." note,
//  5. cut off anything following a known file extension.
//
// A string that matches none of the rules is returned unchanged.
func SanitizeURL(s string) string {
	// ReplaceAllString returns its input as is when the pattern does not
	// match, so no rule needs a separate MatchString guard.

	// http://!!!:
	// http://!
	// http://"
	s = patNonWordDomain.ReplaceAllString(s, `$1$3`)

	// http:///en.m.wikipedia.org/ChenLong
	s = patRepeatedHttpSlashes.ReplaceAllString(s, `$1$3`)

	// http://10.1113/jphysiol.2002.026047
	//
	// Note: a doubled value such as
	// http://10.3386/w20634https://doi.org/10.3386/w20634 is not split by
	// any rule here; only its leading scheme is rewritten.
	s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`)

	// .diaksestanggal27-03-2017.10.30Wib
	// accessedon15
	// .Accessed
	// Acessoem:10/09/2012
	// .Acesso:11Abr
	s = patAccessedOn.ReplaceAllString(s, `$1`)

	// http://140.120.197.173/Ecology/Download/Timing-MSChart.zipJournalofInsectScience
	s = patFileExtraSuffix.ReplaceAllString(s, `$1`)

	return s
}