package skate

import (
	"index/suffixarray"
	"regexp"
	"strings"
)

var (
	// Non-word junk between the schema and the host, e.g. http://!!!host;
	// replacement keeps schema ($1) and remainder ($3), dropping $2.
	patNonWordDomain = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`)
	// Extra slashes right after the schema, e.g. http:///en.m.wikipedia.org.
	patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
	// A bare DOI (10.NNNN/...) used directly as the host part, e.g.
	// http://10.1113/jphysiol...; $2 captures the DOI itself.
	patHttpDOI = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
	// Trailing "accessed on ..." markers in many languages (German,
	// Portuguese, Spanish, Turkish, Indonesian, Russian, French, ...);
	// (?U) makes $1 the shortest prefix before the marker word, so the
	// marker and everything after it can be cut off.
	patAccessedOn = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`)
	// Text glued onto a URL after a known file extension, e.g.
	// http://.../x.zipJournalofInsectScience; $1 is the clean URL.
	patFileExtraSuffix = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))([^?]*)$`)
	// NOTE(review): not referenced anywhere in this chunk; also the class
	// [^://] means "any char except ':' or '/'" (the duplicate ':' is
	// redundant) — confirm intent before relying on it.
	patBrokenHttpSchema = regexp.MustCompile(`^https?[^://]*?.*`)
	// A schema followed by mangled separators, e.g. "http;//host" or
	// "https,host"; $2 is everything after the junk.
	patBrokenSchemaPrefix = regexp.MustCompile(`(http|https)\W+(.*)`)
	// Schemas considered already well-formed; anything else goes through
	// sanitizeRaw first.
	okSchemas = []string{"http://", "https://", "ftp://"}
)

// SanitizeURL applies various cleanup rules on URLs (as they are found in
// references extracted with GROBID). Returns an empty string when no URL
// could be discovered. Still, many results will not be a URL even after
// sanitization.
func SanitizeURL(s string) string { if !hasAnyPrefix(s, okSchemas) { s = sanitizeRaw(s) if s == "" { return s } } var ( // seems to only be about 15% of total time spent index = suffixarray.New([]byte(s)) indices = index.Lookup([]byte("http"), -1) ) if len(indices) == 1 { // ISSN-2177-4129periodicos.ufpel.edu.br/ojs2/index.php/Memoriahttp://dx.doi.org/10.15210/rmr.v8i14.7485 s = s[indices[0]:] } else if len(indices) > 1 { // http://ailab.ist.psu.edu/bcpred/SVMTriP:http://sysbio.unl.edu/SVMTriP/prediction.phpBcell s = s[indices[0]:indices[1]] // only use the first s = strings.TrimRight(s, ":") s = strings.TrimRight(s, ";") } // http://!!!:, // http://!, // http://" s = patNonWordDomain.ReplaceAllString(s, `$1$3`) // http:///en.m.wikipedia.org/ChenLong s = patRepeatedHttpSlashes.ReplaceAllString(s, `$1$3`) // http://10.1113/jphysiol.2002.026047 s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`) // .Acesso:11Abr, accessedon15, ... if patAccessedOn.MatchString(s) { s = patAccessedOn.ReplaceAllString(s, `$1`) } // http://140.120.197.173/Ecology/Download/Timing-MSChart.zipJournalofInsectScience if patFileExtraSuffix.MatchString(s) { s = patFileExtraSuffix.ReplaceAllString(s, `$1`) } return s } func sanitizeRaw(s string) string { if len(s) < 4 { return "" } if !strings.Contains(s, ".") { return "" } var ( index = suffixarray.New([]byte(s)) indices = index.Lookup([]byte("www."), 1) ) if len(indices) > 0 { return "http://" + s[indices[0]:] } if patBrokenSchemaPrefix.MatchString(s) { return patBrokenSchemaPrefix.ReplaceAllString(s, `http://$2`) } return "http://" + s } // hasAnyPrefixes returns true, if any of the prefixes matches string s. func hasAnyPrefix(s string, prefix []string) bool { for _, p := range prefix { if strings.HasPrefix(s, p) { return true } } return false }