package skate

import (
	"index/suffixarray"
	"regexp"
	"sort"
	"strings"
)

var (
	patNonWordDomain       = regexp.MustCompile(`(https?:\/\/)([^\w]*)(.*)`)
	patRepeatedHttpSlashes = regexp.MustCompile(`(https?:\/\/)(\/)*(.*)`)
	patHttpDOI             = regexp.MustCompile(`(https?:\/\/)(10[.][0-9]{1,8}\/.*)`)
	patAccessedOn          = regexp.MustCompile(`(?iU)(.*)[.;,]?(abgerufen|acces-sed|últimoacesso|updated|ac-cessed|zugegriffen|diunduh|adresinden|sayfasındanulaşıl|accessedon|consultéle|consultad|diaksestanggal|diaksespadatanggal|diakses|dateofaccess|lastaccessed|acesso|acessado|accessoem|accessed|recibido|accessedat|доступ-свободный|датаобращения|pageconsulté|indirme|downloadedfrom).*$`)
	patFileExtraSuffix     = regexp.MustCompile(`(?i)(http.*[.](pdf\+html|zip|pdf|html|doc|docx|rar))([^?]*)$`)
	patBrokenHttpSchema    = regexp.MustCompile(`^https?[^://]*?.*`)
	patBrokenSchemaPrefix  = regexp.MustCompile(`(http|https)\W+(.*)`)

	// Note: [...] and naming things, https://stackoverflow.com/q/56297974/89391.
	okPrefixes = []string{"http://", "https://", "ftp://"}
)

// SanitizeURL applies various cleanup rules to URLs (as they are found, e.g.
// in references extracted with GROBID). It returns an empty string when no
// URL could be discovered. Still, many results will not be valid links even
// after sanitization. This is a surprisingly expensive operation, processing
// only about 20K URLs/s. A short circuit with a successful url.Parse does not
// really work, as syntactically valid URL strings may still be improbable
// URLs, e.g. http://!!!x.com, etc.
func SanitizeURL(s string) string {
	if !HasAnyPrefix(s, okPrefixes) {
		s = sanitizeRaw(s)
		if s == "" {
			return s
		}
	}
	index := suffixarray.New([]byte(s)) // seems to only be about 15% of total time spent
	indices := index.Lookup([]byte("http"), -1)
	if len(indices) == 1 {
		// ISSN-2177-4129periodicos.ufpel.edu.br/ojs2/index.php/Memoriahttp://dx.doi.org/10.15210/rmr.v8i14.7485
		s = s[indices[0]:]
	} else if len(indices) > 1 {
		sort.Ints(indices)
		// http://ailab.ist.psu.edu/bcpred/SVMTriP:http://sysbio.unl.edu/SVMTriP/prediction.phpBcell
		s = s[indices[0]:indices[1]]
		s = strings.TrimRight(s, ":")
		s = strings.TrimRight(s, ";")
	}
	// http://!!!:, http://!, http://"
	s = patNonWordDomain.ReplaceAllString(s, `$1$3`)
	// http:///en.m.wikipedia.org/ChenLong
	s = patRepeatedHttpSlashes.ReplaceAllString(s, `$1$3`)
	// http://10.1113/jphysiol.2002.026047
	s = patHttpDOI.ReplaceAllString(s, `https://doi.org/$2`)
	// .Acesso:11Abr, accessedon15, ...
	if patAccessedOn.MatchString(s) {
		s = patAccessedOn.ReplaceAllString(s, `$1`)
	}
	// http://140.120.197.173/Ecology/Download/Timing-MSChart.zipJournalofInsectScience
	if patFileExtraSuffix.MatchString(s) {
		s = patFileExtraSuffix.ReplaceAllString(s, `$1`)
	}
	return s
}

// sanitizeRaw tries to recover a URL from a string lacking a known scheme
// prefix; it returns an empty string when the input looks hopeless.
func sanitizeRaw(s string) string {
	if len(s) < 4 {
		return ""
	}
	if !strings.Contains(s, ".") {
		return ""
	}
	index := suffixarray.New([]byte(s))
	indices := index.Lookup([]byte("www."), 1)
	sort.Ints(indices)
	if len(indices) > 0 {
		return "http://" + s[indices[0]:]
	}
	if patBrokenSchemaPrefix.MatchString(s) {
		return patBrokenSchemaPrefix.ReplaceAllString(s, `http://$2`)
	}
	return "http://" + s
}

// HasAnyPrefix returns true if any of the given prefixes matches string s.
func HasAnyPrefix(s string, prefixes []string) bool {
	for _, p := range prefixes {
		if strings.HasPrefix(s, p) {
			return true
		}
	}
	return false
}
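
// sanitizeURLDemo is a minimal usage sketch added for illustration; it is not
// part of the original file. The expected values in the comments are
// assumptions obtained by tracing the rules in SanitizeURL above (patHttpDOI
// and patNonWordDomain/patRepeatedHttpSlashes, respectively), not recorded
// test output.
func sanitizeURLDemo() []string {
	return []string{
		// A bare DOI behind an http scheme should be rewritten to a doi.org link:
		// "http://10.1113/jphysiol.2002.026047" -> "https://doi.org/10.1113/jphysiol.2002.026047"
		SanitizeURL("http://10.1113/jphysiol.2002.026047"),
		// An extra slash after the scheme should be removed:
		// "http:///en.m.wikipedia.org/ChenLong" -> "http://en.m.wikipedia.org/ChenLong"
		SanitizeURL("http:///en.m.wikipedia.org/ChenLong"),
	}
}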