package skate import ( "index/suffixarray" "regexp" "strings" ) var ( patNonWordDomain = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`) patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`) patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`) patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`) patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))([^?]*)$`) ) // SanitizeURL applies various cleanup rules on URLs as found in references. // Returns an empty string when no URL could be constructed. Still, many // results will not be a URL after all. XXX: Sometimes a URL contains other // identifying information, like: // http://agingmind.utdallas.edu/facedb/view/neutral-faces.doi:10.3758/BF03206543. // http://10.3386/w20634https://doi.org/10.3386/w20634 func SanitizeURL(s string) string { var ( // seems to only be about 15% of total time spent index = suffixarray.New([]byte(s)) indices = index.Lookup([]byte("http"), -1) ) if !strings.HasPrefix(s, "http") && !strings.HasPrefix(s, "ftp") { s = sanitizeRaw(s, index) if s == "" { return s } } if len(indices) > 1 { s = s[0:indices[1]] // only use the first s = strings.TrimRight(s, ":") s = strings.TrimRight(s, ";") } // http://!!!: // http://! // http://" s = patNonWordDomain.ReplaceAllString(s, `$1$3`) // http:///en.m.wikipedia.org/ChenLong s = patRepeatedHttpSlashes.ReplaceAllString(s, `$1$3`) // http://10.1113/jphysiol.2002.026047 s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`) // .diaksestanggal27-03-2017.10.30Wib // accessedon15 // .Accessed // Acessoem:10/09/2012 // .Acesso:11Abr if patAccessedOn.MatchString(s) { s = patAccessedOn.ReplaceAllString(s, `$1`) } // http://140.120.197.173/Ecology/Download/Timing-MSChart.zipJournalofInsectScience if patFileExtraSuffix.MatchString(s) { s = patFileExtraSuffix.ReplaceAllString(s, `$1`) } return s } func sanitizeRaw(s string, index *suffixarray.Index) string { if len(s) < 4 { return "" } if !strings.Contains(s, ".") { return "" } indices := index.Lookup([]byte("www."), 1) if len(indices) > 0 { s = "http://" + s[indices[0]:] } else { s = "http://" + s } return s // Re-trievedfrom }